author:    Andrés Villegas <andresvi@google.com>  2024-01-02 20:11:15 +0000
committer: Andrés Villegas <andresvi@google.com>  2024-01-02 20:11:15 +0000
commit:    9b11c0d602551ecf17d3dfaf487a0b76b1d8485b (patch)
tree:      91d2c2779fc3d53bb2b3199d2cf1eb9a56cec347
parent:    cba0ef6a22f7ced03e34434250edb1ff2f60653a (diff)
parent:    d50e5f5778f0fba272ab3b2a4f7a8103631bcde4 (diff)

Created using spr 1.3.5

Diffstat:
-rw-r--r--.github/CODEOWNERS5
-rw-r--r--.github/workflows/libcxx-build-and-test.yaml2
-rw-r--r--bolt/lib/Core/DIEBuilder.cpp16
-rw-r--r--bolt/lib/Passes/SplitFunctions.cpp100
-rw-r--r--bolt/test/X86/cdsplit-call-scale.s9
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp3
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt1
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp91
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h44
-rw-r--r--clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp31
-rw-r--r--clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h1
-rw-r--r--clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp2
-rw-r--r--clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp219
-rw-r--r--clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h26
-rw-r--r--clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp22
-rw-r--r--clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h3
-rw-r--r--clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp3
-rwxr-xr-xclang-tools-extra/clang-tidy/tool/clang-tidy-diff.py8
-rw-r--r--clang-tools-extra/clang-tidy/utils/ASTUtils.cpp24
-rw-r--r--clang-tools-extra/clang-tidy/utils/ASTUtils.h5
-rw-r--r--clang-tools-extra/clangd/InlayHints.cpp2
-rw-r--r--clang-tools-extra/clangd/unittests/InlayHintTests.cpp13
-rw-r--r--clang-tools-extra/docs/ReleaseNotes.rst36
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst55
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst13
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst13
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst13
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst14
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/list.rst5
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst13
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst27
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst2
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp118
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp14
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp17
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp185
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp9
-rw-r--r--clang/CMakeLists.txt17
-rw-r--r--clang/docs/ClangFormat.rst21
-rw-r--r--clang/docs/ControlFlowIntegrityDesign.rst2
-rw-r--r--clang/docs/LanguageExtensions.rst6
-rw-r--r--clang/docs/LibASTMatchersTutorial.rst2
-rw-r--r--clang/docs/ReleaseNotes.rst144
-rw-r--r--clang/docs/SanitizerCoverage.rst2
-rw-r--r--clang/docs/analyzer/checkers.rst17
-rw-r--r--clang/include/clang/AST/Type.h26
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h47
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/RecordOps.h12
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/Value.h40
-rw-r--r--clang/include/clang/Basic/BuiltinsAArch64.def3
-rw-r--r--clang/include/clang/Basic/DiagnosticGroups.td3
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td12
-rw-r--r--clang/include/clang/Basic/LangOptions.def1
-rw-r--r--clang/include/clang/Basic/PlistSupport.h3
-rw-r--r--clang/include/clang/Basic/TargetInfo.h1
-rw-r--r--clang/include/clang/Basic/Version.h3
-rw-r--r--clang/include/clang/Basic/arm_sme.td279
-rw-r--r--clang/include/clang/Basic/arm_sve.td139
-rw-r--r--clang/include/clang/Basic/riscv_sifive_vector.td28
-rw-r--r--clang/include/clang/Basic/riscv_vector.td8
-rw-r--r--clang/include/clang/Basic/riscv_vector_common.td2
-rw-r--r--clang/include/clang/Config/config.h.cmake6
-rw-r--r--clang/include/clang/Driver/Options.td5
-rw-r--r--clang/include/clang/Serialization/ASTReader.h2
-rw-r--r--clang/include/clang/Serialization/ASTWriter.h88
-rw-r--r--clang/include/clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h1
-rw-r--r--clang/include/clang/StaticAnalyzer/Core/Checker.h14
-rw-r--r--clang/include/clang/StaticAnalyzer/Core/CheckerManager.h11
-rw-r--r--clang/lib/APINotes/APINotesManager.cpp2
-rw-r--r--clang/lib/AST/ASTImporter.cpp35
-rw-r--r--clang/lib/AST/ASTStructuralEquivalence.cpp12
-rw-r--r--clang/lib/AST/Decl.cpp23
-rw-r--r--clang/lib/AST/Interp/Interp.h1
-rw-r--r--clang/lib/ASTMatchers/Dynamic/Parser.cpp6
-rw-r--r--clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp11
-rw-r--r--clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp31
-rw-r--r--clang/lib/Analysis/FlowSensitive/RecordOps.cpp33
-rw-r--r--clang/lib/Analysis/FlowSensitive/Transfer.cpp16
-rw-r--r--clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp2
-rw-r--r--clang/lib/Basic/IdentifierTable.cpp3
-rw-r--r--clang/lib/Basic/Targets/AArch64.cpp4
-rw-r--r--clang/lib/Basic/Targets/ARM.cpp4
-rw-r--r--clang/lib/Basic/Targets/RISCV.cpp3
-rw-r--r--clang/lib/Basic/Targets/X86.cpp3
-rw-r--r--clang/lib/Basic/Version.cpp18
-rw-r--r--clang/lib/Basic/Warnings.cpp9
-rw-r--r--clang/lib/CodeGen/BackendUtil.cpp2
-rw-r--r--clang/lib/CodeGen/CGBuiltin.cpp47
-rw-r--r--clang/lib/CodeGen/CGStmtOpenMP.cpp61
-rw-r--r--clang/lib/CodeGen/CodeGenAction.cpp3
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp10
-rw-r--r--clang/lib/CodeGen/Targets/AArch64.cpp2
-rw-r--r--clang/lib/CodeGen/Targets/LoongArch.cpp7
-rw-r--r--clang/lib/Driver/Driver.cpp14
-rw-r--r--clang/lib/Driver/ToolChains/AIX.cpp6
-rw-r--r--clang/lib/Driver/ToolChains/Arch/RISCV.cpp13
-rw-r--r--clang/lib/Driver/ToolChains/Arch/X86.cpp4
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp22
-rw-r--r--clang/lib/Driver/ToolChains/CommonArgs.cpp28
-rw-r--r--clang/lib/Driver/ToolChains/Gnu.cpp12
-rw-r--r--clang/lib/Driver/ToolChains/Solaris.cpp3
-rw-r--r--clang/lib/Format/CMakeLists.txt1
-rw-r--r--clang/lib/Format/ContinuationIndenter.cpp22
-rw-r--r--clang/lib/Format/Format.cpp3
-rw-r--r--clang/lib/Format/MatchFilePath.cpp122
-rw-r--r--clang/lib/Format/MatchFilePath.h22
-rw-r--r--clang/lib/Format/TokenAnnotator.cpp3
-rw-r--r--clang/lib/Format/UnwrappedLineParser.cpp6
-rw-r--r--clang/lib/Frontend/DependencyGraph.cpp3
-rw-r--r--clang/lib/Frontend/LayoutOverrideSource.cpp9
-rw-r--r--clang/lib/Frontend/VerifyDiagnosticConsumer.cpp3
-rw-r--r--clang/lib/Headers/CMakeLists.txt1
-rw-r--r--clang/lib/Headers/adcintrin.h160
-rw-r--r--clang/lib/Headers/adxintrin.h143
-rw-r--r--clang/lib/Headers/immintrin.h8
-rw-r--r--clang/lib/Headers/riscv_bitmanip.h24
-rw-r--r--clang/lib/Headers/usermsrintrin.h21
-rw-r--r--clang/lib/Sema/SemaChecking.cpp47
-rw-r--r--clang/lib/Sema/SemaDeclAttr.cpp3
-rw-r--r--clang/lib/Sema/SemaInit.cpp58
-rw-r--r--clang/lib/Sema/SemaModule.cpp3
-rw-r--r--clang/lib/Sema/SemaOpenMP.cpp2
-rw-r--r--clang/lib/Sema/SemaStmt.cpp7
-rw-r--r--clang/lib/Serialization/ASTReaderDecl.cpp54
-rw-r--r--clang/lib/Serialization/ASTReaderStmt.cpp405
-rw-r--r--clang/lib/Serialization/ASTWriter.cpp15
-rw-r--r--clang/lib/Serialization/ASTWriterDecl.cpp585
-rw-r--r--clang/lib/Serialization/ASTWriterStmt.cpp241
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp7
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp2
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp57
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp7
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp382
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CXXDeleteChecker.cpp24
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp9
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp6
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp48
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp6
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp6
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CloneChecker.cpp15
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ConversionChecker.cpp7
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp15
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp15
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp16
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp10
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/EnumCastOutOfRangeChecker.cpp8
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp8
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp6
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/InvalidatedIteratorChecker.cpp28
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/Iterator.cpp14
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/Iterator.h15
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp49
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp20
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp12
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp6
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h21
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp22
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp10
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/MismatchedIteratorChecker.cpp49
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp16
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/NSAutoreleasePoolChecker.cpp9
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp21
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp17
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp11
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp9
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp19
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp9
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp12
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp6
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp10
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/ReturnUndefChecker.cpp15
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp27
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp16
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp86
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h2
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/TaintTesterChecker.cpp5
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp10
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp89
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp8
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp8
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UndefinedArraySubscriptChecker.cpp7
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp11
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObject.h2
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObjectChecker.cpp11
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp32
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp20
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp7
-rw-r--r--clang/lib/StaticAnalyzer/Core/CommonBugCategories.cpp1
-rw-r--r--clang/lib/StaticAnalyzer/Core/Environment.cpp2
-rw-r--r--clang/lib/StaticAnalyzer/Core/RegionStore.cpp6
-rw-r--r--clang/lib/Support/RISCVVIntrinsicUtils.cpp2
-rw-r--r--clang/lib/Tooling/Refactoring/Lookup.cpp4
-rw-r--r--clang/lib/Tooling/Tooling.cpp4
-rw-r--r--clang/test/Analysis/Inputs/system-header-simulator.h1
-rw-r--r--clang/test/Analysis/stream-errno.c26
-rw-r--r--clang/test/Analysis/stream-error.c67
-rw-r--r--clang/test/Analysis/string.cpp25
-rw-r--r--clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp3
-rw-r--r--clang/test/CodeGen/LoongArch/lasx/builtin-alias.c4876
-rw-r--r--clang/test/CodeGen/LoongArch/lasx/builtin.c4874
-rw-r--r--clang/test/CodeGen/LoongArch/lsx/builtin-alias.c4746
-rw-r--r--clang/test/CodeGen/LoongArch/lsx/builtin.c4746
-rw-r--r--clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c60
-rw-r--r--clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c14
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c24
-rw-r--r--clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c24
-rw-r--r--clang/test/CodeGen/X86/ms-x86-intrinsics.c4
-rw-r--r--clang/test/CodeGen/aarch64-branch-protection-attr.c28
-rw-r--r--clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c72
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c297
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c282
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c1103
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c292
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c696
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c1790
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c292
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c696
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c222
-rw-r--r--clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c99
-rw-r--r--clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c99
-rw-r--r--clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c22
-rw-r--r--clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c98
-rw-r--r--clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c98
-rw-r--r--clang/test/CodeGen/arm-bf16-params-returns.c8
-rw-r--r--clang/test/CodeGen/arm-vector_type-params-returns.c24
-rw-r--r--clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c44
-rw-r--r--clang/test/CodeGen/ifunc.c8
-rw-r--r--clang/test/CodeGen/isfpclass.c28
-rw-r--r--clang/test/CodeGen/ms-mixed-ptr-sizes.c8
-rw-r--r--clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu16
-rw-r--r--clang/test/CodeGenOpenCL/as_type.cl12
-rw-r--r--clang/test/Driver/aarch64-pauth-lr.c23
-rw-r--r--clang/test/Driver/aarch64-v95a.c11
-rw-r--r--clang/test/Driver/fbasic-block-sections.c22
-rw-r--r--clang/test/Driver/modules.m16
-rw-r--r--clang/test/Driver/riscv-cpus.c1
-rw-r--r--clang/test/Driver/riscv-default-features.c2
-rw-r--r--clang/test/Driver/riscv-features.c2
-rw-r--r--clang/test/Format/clang-format-ignore.cpp33
-rw-r--r--clang/test/Modules/pr64755.cppm9
-rw-r--r--clang/test/OpenMP/atomic_compare_codegen.cpp144
-rw-r--r--clang/test/PCH/pr76443.cpp24
-rw-r--r--clang/test/Preprocessor/aarch64-target-features.c1
-rw-r--r--clang/test/Preprocessor/predefined-arch-macros.c6
-rw-r--r--clang/test/Preprocessor/riscv-target-features.c36
-rw-r--r--clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp109
-rw-r--r--clang/test/Sema/riscv-vector-zve32x-check.c8
-rw-r--r--clang/test/Sema/riscv-vector-zve64x-check.c8
-rw-r--r--clang/test/Sema/rvv-required-features.c18
-rw-r--r--clang/test/Sema/switch-default.c28
-rw-r--r--clang/test/Sema/switch-default.cpp53
-rw-r--r--clang/test/SemaCXX/GH63151.cpp10
-rw-r--r--clang/test/SemaCXX/crash-GH76228.cpp28
-rw-r--r--clang/test/SemaCXX/paren-list-agg-init.cpp2
-rw-r--r--clang/tools/clang-format/ClangFormat.cpp69
-rw-r--r--clang/tools/libclang/CIndexer.cpp4
-rw-r--r--clang/unittests/AST/ASTImporterTest.cpp150
-rw-r--r--clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp37
-rw-r--r--clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp2
-rw-r--r--clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp150
-rw-r--r--clang/unittests/Format/CMakeLists.txt1
-rw-r--r--clang/unittests/Format/FormatTest.cpp45
-rw-r--r--clang/unittests/Format/FormatTestJS.cpp6
-rw-r--r--clang/unittests/Format/MatchFilePathTest.cpp169
-rw-r--r--clang/unittests/Format/TokenAnnotatorTest.cpp7
-rw-r--r--clang/unittests/StaticAnalyzer/CallEventTest.cpp5
-rw-r--r--clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp7
-rw-r--r--clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp7
-rw-r--r--clang/utils/TableGen/RISCVVEmitter.cpp2
-rw-r--r--clang/utils/TableGen/SveEmitter.cpp19
-rw-r--r--compiler-rt/lib/asan/asan_linux.cpp13
-rw-r--r--compiler-rt/lib/asan/asan_new_delete.cpp9
-rw-r--r--compiler-rt/lib/asan/asan_posix.cpp48
-rw-r--r--compiler-rt/lib/asan/asan_rtl_x86_64.S5
-rw-r--r--compiler-rt/lib/builtins/cpu_model/aarch64.c4
-rw-r--r--compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc59
-rw-r--r--compiler-rt/lib/builtins/cpu_model/x86.c50
-rw-r--r--compiler-rt/lib/builtins/fp_lib.h12
-rw-r--r--compiler-rt/lib/builtins/int_types.h4
-rw-r--r--compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp6
-rw-r--r--compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h3
-rw-r--r--compiler-rt/lib/dfsan/dfsan_custom.cpp12
-rw-r--r--compiler-rt/lib/hwasan/hwasan_linux.cpp46
-rw-r--r--compiler-rt/lib/hwasan/hwasan_report.cpp67
-rw-r--r--compiler-rt/lib/lsan/lsan_posix.cpp30
-rw-r--r--compiler-rt/lib/memprof/memprof_linux.cpp1
-rw-r--r--compiler-rt/lib/msan/msan_chained_origin_depot.cpp8
-rw-r--r--compiler-rt/lib/msan/msan_chained_origin_depot.h4
-rw-r--r--compiler-rt/lib/msan/msan_linux.cpp32
-rw-r--r--compiler-rt/lib/msan/tests/msan_test.cpp2
-rw-r--r--compiler-rt/lib/sanitizer_common/CMakeLists.txt1
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp6
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h4
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc4
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h4
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_freebsd.h137
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp9
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp8
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h5
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h2
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h2
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp11
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp9
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp8
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h4
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h31
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp27
-rw-r--r--compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp6
-rw-r--r--compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp4
-rwxr-xr-xcompiler-rt/lib/tsan/go/buildgo.sh2
-rw-r--r--compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp2
-rw-r--r--compiler-rt/lib/tsan/rtl/tsan_platform.h32
-rw-r--r--compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp8
-rw-r--r--compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c62
-rw-r--r--compiler-rt/test/hwasan/TestCases/stack-overflow.c24
-rw-r--r--compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c3
-rw-r--r--compiler-rt/test/hwasan/TestCases/stack-uar.c7
-rw-r--r--compiler-rt/test/hwasan/TestCases/stack-uas.c7
-rw-r--r--compiler-rt/test/hwasan/TestCases/stack-underflow.c24
-rw-r--r--compiler-rt/test/hwasan/TestCases/strip_path_prefix.c27
-rw-r--r--compiler-rt/test/msan/eventfd.cpp (renamed from compiler-rt/test/msan/Linux/eventfd.cpp)5
-rw-r--r--flang/docs/Extensions.md17
-rw-r--r--flang/docs/FlangDriver.md57
-rw-r--r--flang/docs/Intrinsics.md6
-rw-r--r--flang/examples/FeatureList/FeatureList.cpp3
-rw-r--r--flang/include/flang/Common/uint128.h32
-rw-r--r--flang/include/flang/Decimal/binary-floating-point.h2
-rw-r--r--flang/include/flang/Decimal/decimal.h1
-rw-r--r--flang/include/flang/Evaluate/tools.h2
-rw-r--r--flang/include/flang/Frontend/CodeGenOptions.def1
-rw-r--r--flang/include/flang/ISO_Fortran_binding.h4
-rw-r--r--flang/include/flang/Lower/PFTBuilder.h15
-rw-r--r--flang/include/flang/Lower/Runtime.h3
-rw-r--r--flang/include/flang/Optimizer/Builder/HLFIRTools.h8
-rw-r--r--flang/include/flang/Optimizer/Builder/IntrinsicCall.h1
-rw-r--r--flang/include/flang/Optimizer/Dialect/FIRType.h6
-rw-r--r--flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h6
-rw-r--r--flang/include/flang/Optimizer/Transforms/Passes.h10
-rw-r--r--flang/include/flang/Optimizer/Transforms/Passes.td21
-rw-r--r--flang/include/flang/Parser/dump-parse-tree.h3
-rw-r--r--flang/include/flang/Parser/parse-tree.h30
-rw-r--r--flang/include/flang/Runtime/allocatable.h36
-rw-r--r--flang/include/flang/Runtime/api-attrs.h11
-rw-r--r--flang/include/flang/Runtime/array-constructor.h20
-rw-r--r--flang/include/flang/Runtime/character.h76
-rw-r--r--flang/include/flang/Runtime/derived-api.h16
-rw-r--r--flang/include/flang/Runtime/descriptor.h7
-rw-r--r--flang/include/flang/Runtime/extensions.h4
-rw-r--r--flang/include/flang/Runtime/inquiry.h8
-rw-r--r--flang/include/flang/Runtime/io-api.h28
-rw-r--r--flang/include/flang/Runtime/magic-numbers.h4
-rw-r--r--flang/include/flang/Runtime/matmul-transpose.h4
-rw-r--r--flang/include/flang/Runtime/matmul.h4
-rw-r--r--flang/include/flang/Runtime/memory.h7
-rw-r--r--flang/include/flang/Runtime/misc-intrinsic.h4
-rw-r--r--flang/include/flang/Runtime/numeric.h248
-rw-r--r--flang/include/flang/Runtime/pointer.h36
-rw-r--r--flang/include/flang/Runtime/ragged.h9
-rw-r--r--flang/include/flang/Runtime/reduction.h274
-rw-r--r--flang/include/flang/Runtime/support.h2
-rw-r--r--flang/include/flang/Tools/CLOptions.inc15
-rw-r--r--flang/include/flang/Tools/CrossToolHelpers.h3
-rw-r--r--flang/lib/Decimal/big-radix-floating-point.h6
-rw-r--r--flang/lib/Decimal/decimal-to-binary.cpp66
-rw-r--r--flang/lib/Evaluate/real.cpp2
-rw-r--r--flang/lib/Evaluate/tools.cpp14
-rw-r--r--flang/lib/Frontend/CompilerInvocation.cpp17
-rw-r--r--flang/lib/Lower/Bridge.cpp4
-rw-r--r--flang/lib/Lower/ConvertCall.cpp8
-rw-r--r--flang/lib/Lower/IO.cpp27
-rw-r--r--flang/lib/Lower/OpenACC.cpp107
-rw-r--r--flang/lib/Lower/OpenMP.cpp110
-rw-r--r--flang/lib/Lower/Runtime.cpp6
-rw-r--r--flang/lib/Optimizer/Builder/HLFIRTools.cpp11
-rw-r--r--flang/lib/Optimizer/Builder/IntrinsicCall.cpp47
-rw-r--r--flang/lib/Optimizer/Transforms/CMakeLists.txt1
-rw-r--r--flang/lib/Optimizer/Transforms/FunctionAttr.cpp62
-rw-r--r--flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp2
-rw-r--r--flang/lib/Optimizer/Transforms/StackArrays.cpp2
-rw-r--r--flang/lib/Parser/Fortran-parsers.cpp5
-rw-r--r--flang/lib/Parser/executable-parsers.cpp21
-rw-r--r--flang/lib/Parser/prescan.cpp8
-rw-r--r--flang/lib/Parser/unparse.cpp9
-rw-r--r--flang/lib/Semantics/check-coarray.cpp77
-rw-r--r--flang/lib/Semantics/check-coarray.h2
-rw-r--r--flang/lib/Semantics/resolve-names.cpp174
-rw-r--r--flang/module/__fortran_builtins.f904
-rw-r--r--flang/module/iso_fortran_env.f909
-rw-r--r--flang/runtime/CMakeLists.txt39
-rw-r--r--flang/runtime/allocatable.cpp33
-rw-r--r--flang/runtime/array-constructor.cpp23
-rw-r--r--flang/runtime/character.cpp150
-rw-r--r--flang/runtime/connection.cpp16
-rw-r--r--flang/runtime/connection.h2
-rw-r--r--flang/runtime/copy.cpp7
-rw-r--r--flang/runtime/derived-api.cpp28
-rw-r--r--flang/runtime/dot-product.cpp62
-rw-r--r--flang/runtime/edit-input.cpp57
-rw-r--r--flang/runtime/edit-output.cpp20
-rw-r--r--flang/runtime/extensions.cpp39
-rw-r--r--flang/runtime/extrema.cpp275
-rw-r--r--flang/runtime/findloc.cpp96
-rw-r--r--flang/runtime/freestanding-tools.h16
-rw-r--r--flang/runtime/inquiry.cpp8
-rw-r--r--flang/runtime/io-api.cpp45
-rw-r--r--flang/runtime/io-stmt.cpp48
-rw-r--r--flang/runtime/io-stmt.h7
-rw-r--r--flang/runtime/matmul-transpose.cpp47
-rw-r--r--flang/runtime/matmul.cpp94
-rw-r--r--flang/runtime/memory.cpp22
-rw-r--r--flang/runtime/misc-intrinsic.cpp15
-rw-r--r--flang/runtime/namelist.cpp63
-rw-r--r--flang/runtime/numeric.cpp320
-rw-r--r--flang/runtime/pointer.cpp34
-rw-r--r--flang/runtime/product.cpp72
-rw-r--r--flang/runtime/ragged.cpp29
-rw-r--r--flang/runtime/reduction-templates.h51
-rw-r--r--flang/runtime/reduction.cpp90
-rw-r--r--flang/runtime/sum.cpp71
-rw-r--r--flang/runtime/support.cpp4
-rw-r--r--flang/runtime/tools.h47
-rw-r--r--flang/runtime/transformational.cpp37
-rw-r--r--flang/runtime/unit.cpp10
-rw-r--r--flang/test/Driver/dynamic-linker.f9012
-rw-r--r--flang/test/Driver/frame-pointer-forwarding.f9013
-rw-r--r--flang/test/Driver/func-attr.f9025
-rw-r--r--flang/test/Driver/mlir-debug-pass-pipeline.f902
-rw-r--r--flang/test/Driver/no-duplicate-main.f902
-rw-r--r--flang/test/Driver/save-mlir-temps.f904
-rw-r--r--flang/test/Evaluate/fold-nearest.f902
-rw-r--r--flang/test/Fir/box-offset-codegen.fir8
-rw-r--r--flang/test/Fir/polymorphic.fir8
-rw-r--r--flang/test/Fir/tbaa-codegen.fir2
-rw-r--r--flang/test/Fir/tbaa-codegen2.fir2
-rw-r--r--flang/test/Lower/HLFIR/calls-f77.f902
-rw-r--r--flang/test/Lower/HLFIR/convert-mbox-to-value.f904
-rw-r--r--flang/test/Lower/Intrinsics/associated-proc-pointers.f90116
-rw-r--r--flang/test/Lower/Intrinsics/c_f_procpointer.f9042
-rw-r--r--flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f9038
-rw-r--r--flang/test/Lower/OpenACC/acc-device-type.f9044
-rw-r--r--flang/test/Lower/OpenACC/acc-kernels-loop.f9014
-rw-r--r--flang/test/Lower/OpenACC/acc-kernels.f9014
-rw-r--r--flang/test/Lower/OpenACC/acc-parallel-loop.f9014
-rw-r--r--flang/test/Lower/OpenACC/acc-parallel.f9016
-rw-r--r--flang/test/Lower/OpenACC/acc-serial-loop.f9010
-rw-r--r--flang/test/Lower/OpenACC/acc-serial.f9010
-rw-r--r--flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f904
-rw-r--r--flang/test/Lower/OpenMP/FIR/target.f9094
-rw-r--r--flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f904
-rw-r--r--flang/test/Lower/OpenMP/target.f9088
-rw-r--r--flang/test/Lower/array-character.f904
-rw-r--r--flang/test/Lower/array-expression-slice-1.f904
-rw-r--r--flang/test/Lower/array-expression.f902
-rw-r--r--flang/test/Lower/array-temp.f9012
-rw-r--r--flang/test/Lower/host-associated.f902
-rw-r--r--flang/test/Lower/io-statement-2.f904
-rw-r--r--flang/test/Lower/vector-subscript-io.f9026
-rw-r--r--flang/test/Parser/compiler-directives.f901
-rw-r--r--flang/test/Runtime/no-cpp-dep.c5
-rw-r--r--flang/test/Semantics/bad-forward-type.f901
-rw-r--r--flang/test/Semantics/contiguous01.f902
-rw-r--r--flang/test/Semantics/init01.f902
-rw-r--r--flang/test/Semantics/intrinsics02.f9038
-rw-r--r--flang/test/Semantics/notifywait01.f9026
-rw-r--r--flang/test/Semantics/notifywait02.f9074
-rw-r--r--flang/test/Semantics/notifywait03.f90123
-rw-r--r--flang/test/Semantics/pointer01.f901
-rw-r--r--flang/test/Semantics/resolve61.f907
-rw-r--r--flang/test/Semantics/symbol15.f9024
-rw-r--r--flang/test/Transforms/simplifyintrinsics.fir4
-rw-r--r--flang/unittests/Runtime/CommandTest.cpp83
-rw-r--r--flang/unittests/Runtime/Numeric.cpp2
-rw-r--r--flang/unittests/Runtime/NumericalFormatTest.cpp165
-rw-r--r--libc/cmake/modules/prepare_libc_gpu_build.cmake5
-rw-r--r--libc/fuzzing/stdlib/CMakeLists.txt2
-rw-r--r--libc/fuzzing/stdlib/strtofloat_fuzz.cpp2
-rw-r--r--libc/src/__support/CPP/bit.h8
-rw-r--r--libc/src/__support/FPUtil/CMakeLists.txt17
-rw-r--r--libc/src/__support/FPUtil/FPBits.h360
-rw-r--r--libc/src/__support/FPUtil/FloatProperties.h211
-rw-r--r--libc/src/__support/FPUtil/ManipulationFunctions.h1
-rw-r--r--libc/src/__support/FPUtil/dyadic_float.h1
-rw-r--r--libc/src/__support/FPUtil/fpbits_str.h1
-rw-r--r--libc/src/__support/FPUtil/generic/CMakeLists.txt2
-rw-r--r--libc/src/__support/FPUtil/generic/FMA.h1
-rw-r--r--libc/src/__support/FPUtil/x86_64/LongDoubleBits.h96
-rw-r--r--libc/src/__support/UInt.h44
-rw-r--r--libc/src/math/generic/exp.cpp4
-rw-r--r--libc/src/math/generic/exp10.cpp4
-rw-r--r--libc/src/math/generic/exp2.cpp4
-rw-r--r--libc/src/math/generic/expm1.cpp4
-rw-r--r--libc/src/math/gpu/vendor/amdgpu/platform.h4
-rw-r--r--libc/src/stdio/generic/puts.cpp21
-rw-r--r--libc/src/stdio/printf_core/char_converter.h6
-rw-r--r--libc/src/stdio/printf_core/converter_utils.h1
-rw-r--r--libc/src/stdio/printf_core/core_structs.h12
-rw-r--r--libc/src/stdio/printf_core/float_dec_converter.h36
-rw-r--r--libc/src/stdio/printf_core/float_hex_converter.h2
-rw-r--r--libc/src/stdio/printf_core/float_inf_nan_converter.h1
-rw-r--r--libc/src/stdio/printf_core/int_converter.h1
-rw-r--r--libc/src/stdio/printf_core/parser.h3
-rw-r--r--libc/src/stdio/printf_core/ptr_converter.h3
-rw-r--r--libc/src/stdio/printf_core/string_converter.h1
-rw-r--r--libc/src/stdio/printf_core/write_int_converter.h1
-rw-r--r--libc/src/stdio/printf_core/writer.cpp2
-rw-r--r--libc/src/stdio/printf_core/writer.h2
-rw-r--r--libc/src/stdio/scanf_core/converter_utils.h2
-rw-r--r--libc/src/stdio/scanf_core/core_structs.h3
-rw-r--r--libc/src/stdio/scanf_core/current_pos_converter.h1
-rw-r--r--libc/src/stdio/scanf_core/parser.h1
-rw-r--r--libc/src/stdio/scanf_core/reader.h9
-rw-r--r--libc/src/time/gpu/time_utils.h10
-rw-r--r--libc/test/src/__support/CMakeLists.txt2
-rw-r--r--libc/test/src/__support/str_to_fp_test.h2
-rw-r--r--libc/test/src/__support/uint_test.cpp63
-rw-r--r--libc/utils/MPFRWrapper/CMakeLists.txt1
-rw-r--r--libc/utils/MPFRWrapper/MPFRUtils.cpp1
-rw-r--r--libcxx/docs/ReleaseNotes/18.rst16
-rw-r--r--libcxx/docs/Status/Cxx17Papers.csv2
-rw-r--r--libcxx/docs/Status/Cxx20.rst2
-rw-r--r--libcxx/docs/UsingLibcxx.rst4
-rw-r--r--libcxx/include/CMakeLists.txt2
-rw-r--r--libcxx/include/__algorithm/find.h19
-rw-r--r--libcxx/include/__algorithm/ranges_max.h5
-rw-r--r--libcxx/include/__algorithm/ranges_min.h5
-rw-r--r--libcxx/include/__algorithm/ranges_minmax.h5
-rw-r--r--libcxx/include/__algorithm/sample.h2
-rw-r--r--libcxx/include/__config10
-rw-r--r--libcxx/include/__format/formatter_output.h6
-rw-r--r--libcxx/include/__format/parser_std_format_spec.h6
-rw-r--r--libcxx/include/__iterator/common_iterator.h32
-rw-r--r--libcxx/include/__iterator/counted_iterator.h11
-rw-r--r--libcxx/include/__memory/shared_ptr.h4
-rw-r--r--libcxx/include/__memory/unique_ptr.h4
-rw-r--r--libcxx/include/__ranges/chunk_by_view.h3
-rw-r--r--libcxx/include/__ranges/drop_view.h7
-rw-r--r--libcxx/include/__ranges/drop_while_view.h3
-rw-r--r--libcxx/include/__ranges/filter_view.h2
-rw-r--r--libcxx/include/__ranges/movable_box.h62
-rw-r--r--libcxx/include/__ranges/repeat_view.h4
-rw-r--r--libcxx/include/__ranges/single_view.h4
-rw-r--r--libcxx/include/__ranges/subrange.h4
-rw-r--r--libcxx/include/__ranges/take_while_view.h3
-rw-r--r--libcxx/include/__ranges/transform_view.h3
-rw-r--r--libcxx/include/__ranges/view_interface.h10
-rw-r--r--libcxx/include/__type_traits/datasizeof.h11
-rw-r--r--libcxx/include/__utility/is_pointer_in_range.h2
-rw-r--r--libcxx/include/deque27
-rw-r--r--libcxx/include/experimental/__simd/abi_tag.h55
-rw-r--r--libcxx/include/experimental/__simd/aligned_tag.h13
-rw-r--r--libcxx/include/experimental/__simd/declaration.h52
-rw-r--r--libcxx/include/experimental/__simd/internal_declaration.h41
-rw-r--r--libcxx/include/experimental/__simd/scalar.h2
-rw-r--r--libcxx/include/experimental/__simd/simd.h2
-rw-r--r--libcxx/include/experimental/__simd/simd_mask.h2
-rw-r--r--libcxx/include/experimental/__simd/traits.h15
-rw-r--r--libcxx/include/experimental/__simd/vec_ext.h6
-rw-r--r--libcxx/include/experimental/simd1
-rw-r--r--libcxx/include/memory2
-rw-r--r--libcxx/include/module.modulemap.in2
-rw-r--r--libcxx/include/string11
-rw-r--r--libcxx/include/vector11
-rw-r--r--libcxx/modules/std/algorithm.inc4
-rw-r--r--libcxx/src/support/ibm/xlocale_zos.cpp2
-rw-r--r--libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp2
-rw-r--r--libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp (renamed from libcxx/test/std/containers/sequences/deque/abi.compile.pass.cpp)0
-rw-r--r--libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp42
-rw-r--r--libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp58
-rw-r--r--libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp46
-rw-r--r--libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp56
-rw-r--r--libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp76
-rw-r--r--libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp23
-rw-r--r--libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp23
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp8
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp8
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp42
-rw-r--r--libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp8
-rw-r--r--libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp38
-rw-r--r--libcxx/test/std/ranges/range.adaptors/range.drop/types.h32
-rw-r--r--libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp40
-rw-r--r--libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp17
-rw-r--r--libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp70
-rw-r--r--libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.deprecated_in_cxx17.verify.cpp24
-rw-r--r--libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.pass.cpp4
-rw-r--r--libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.removed_in_cxx20.verify.cpp22
-rw-r--r--libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp2
-rw-r--r--libcxxabi/src/demangle/ItaniumDemangle.h22
-rw-r--r--lld/COFF/Driver.cpp3
-rw-r--r--lld/COFF/Writer.cpp2
-rw-r--r--lld/ELF/Arch/LoongArch.cpp20
-rw-r--r--lld/ELF/InputSection.cpp22
-rw-r--r--lld/MachO/InputFiles.h2
-rw-r--r--lld/MachO/Options.td2
-rw-r--r--lld/test/COFF/merge-wowthk.s44
-rw-r--r--lld/test/COFF/savetemps.ll2
-rw-r--r--lld/test/ELF/debug-dead-reloc-32.s11
-rw-r--r--lld/test/ELF/debug-dead-reloc.s24
-rw-r--r--lld/test/ELF/loongarch-call36.s69
-rw-r--r--lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll2
-rw-r--r--lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll2
-rw-r--r--lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll2
-rw-r--r--lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll2
-rw-r--r--lld/test/ELF/lto/devirt_vcall_vis_public.ll2
-rw-r--r--lld/wasm/InputFiles.cpp11
-rw-r--r--lldb/CMakeLists.txt4
-rw-r--r--lldb/include/lldb/Host/HostGetOpt.h4
-rw-r--r--lldb/source/Host/linux/HostInfoLinux.cpp2
-rw-r--r--lldb/source/Interpreter/CommandInterpreter.cpp6
-rw-r--r--lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp37
-rw-r--r--lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h2
-rw-r--r--lldb/source/Target/JITLoaderList.cpp2
-rw-r--r--lldb/test/API/commands/command/script/TestCommandScript.py13
-rw-r--r--lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py7
-rw-r--r--lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py9
-rw-r--r--lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py24
-rw-r--r--lldb/tools/lldb-dap/BreakpointBase.cpp2
-rw-r--r--lldb/tools/lldb-dap/JSONUtils.cpp269
-rw-r--r--lldb/tools/lldb-dap/JSONUtils.h60
-rw-r--r--lldb/tools/lldb-dap/lldb-dap.cpp12
-rw-r--r--lldb/unittests/Thread/CMakeLists.txt6
-rw-r--r--lldb/unittests/Thread/ThreadTest.cpp71
-rw-r--r--lldb/unittests/Utility/ChecksumTest.cpp12
-rw-r--r--lldb/unittests/Utility/FileSpecTest.cpp3
-rw-r--r--llvm/cmake/config-ix.cmake12
-rw-r--r--llvm/docs/AMDGPUUsage.rst59
-rw-r--r--llvm/docs/CommandGuide/llvm-cxxfilt.rst4
-rw-r--r--llvm/docs/CommandGuide/llvm-exegesis.rst4
-rw-r--r--llvm/docs/HowToReleaseLLVM.rst31
-rw-r--r--llvm/docs/LangRef.rst24
-rw-r--r--llvm/docs/RISCVUsage.rst11
-rw-r--r--llvm/docs/ReleaseNotes.rst10
-rw-r--r--llvm/docs/llvm-objdump.1209
-rw-r--r--llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst2
-rw-r--r--llvm/include/llvm/ADT/GenericUniformityImpl.h6
-rw-r--r--llvm/include/llvm/ADT/SmallString.h65
-rw-r--r--llvm/include/llvm/ADT/Twine.h2
-rw-r--r--llvm/include/llvm/Analysis/ConstraintSystem.h7
-rw-r--r--llvm/include/llvm/Analysis/TargetLibraryInfo.h10
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h26
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfoImpl.h7
-rw-r--r--llvm/include/llvm/Analysis/ValueTracking.h5
-rw-r--r--llvm/include/llvm/CodeGen/AccelTable.h8
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h18
-rw-r--r--llvm/include/llvm/CodeGen/MacroFusion.h12
-rw-r--r--llvm/include/llvm/CodeGen/TargetLowering.h6
-rw-r--r--llvm/include/llvm/Config/config.h.cmake6
-rw-r--r--llvm/include/llvm/Config/llvm-config.h.cmake6
-rw-r--r--llvm/include/llvm/Demangle/Demangle.h5
-rw-r--r--llvm/include/llvm/Demangle/ItaniumDemangle.h22
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h7
-rw-r--r--llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h13
-rw-r--r--llvm/include/llvm/IR/IntrinsicInst.h24
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td36
-rw-r--r--llvm/include/llvm/IR/PatternMatch.h26
-rw-r--r--llvm/include/llvm/Object/Wasm.h1
-rw-r--r--llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h2
-rw-r--r--llvm/include/llvm/ProfileData/SampleProf.h4
-rw-r--r--llvm/include/llvm/Support/AMDHSAKernelDescriptor.h44
-rw-r--r--llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h9
-rw-r--r--llvm/include/llvm/Target/GlobalISel/Combine.td15
-rw-r--r--llvm/include/llvm/TargetParser/AArch64TargetParser.h4
-rw-r--r--llvm/include/llvm/TargetParser/ARMTargetParserCommon.h1
-rw-r--r--llvm/include/llvm/TextAPI/Utils.h34
-rw-r--r--llvm/include/llvm/Transforms/IPO/Attributor.h10
-rw-r--r--llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h5
-rw-r--r--llvm/lib/Analysis/ConstraintSystem.cpp10
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp32
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp163
-rw-r--r--llvm/lib/Analysis/MemoryDependenceAnalysis.cpp18
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp7
-rw-r--r--llvm/lib/Analysis/TargetLibraryInfo.cpp10
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp9
-rw-r--r--llvm/lib/Analysis/VFABIDemangling.cpp2
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp131
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp2
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp353
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp51
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h45
-rw-r--r--llvm/lib/CodeGen/LiveRangeEdit.cpp3
-rw-r--r--llvm/lib/CodeGen/MachineCopyPropagation.cpp42
-rw-r--r--llvm/lib/CodeGen/MachineInstrBundle.cpp6
-rw-r--r--llvm/lib/CodeGen/MacroFusion.cpp12
-rw-r--r--llvm/lib/CodeGen/RegAllocFast.cpp139
-rw-r--r--llvm/lib/CodeGen/RegisterClassInfo.cpp3
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp57
-rw-r--r--llvm/lib/CodeGen/ReplaceWithVeclib.cpp211
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp25
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FastISel.cpp3
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp3
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp9
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp11
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp23
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp17
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp15
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp8
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp10
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp3
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp3
-rw-r--r--llvm/lib/CodeGen/TwoAddressInstructionPass.cpp3
-rw-r--r--llvm/lib/Demangle/Demangle.cpp5
-rw-r--r--llvm/lib/Demangle/ItaniumDemangle.cpp4
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp32
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp32
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/Orc/LLJIT.cpp6
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp12
-rw-r--r--llvm/lib/FuzzMutate/FuzzerCLI.cpp16
-rw-r--r--llvm/lib/MC/WasmObjectWriter.cpp3
-rw-r--r--llvm/lib/Object/WasmObjectFile.cpp57
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp2
-rw-r--r--llvm/lib/Support/RISCVISAInfo.cpp11
-rw-r--r--llvm/lib/Support/Signals.cpp2
-rw-r--r--llvm/lib/Support/Windows/Path.inc5
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td11
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp18
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp39
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp59
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td74
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp36
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td56
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp229
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp28
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h18
-rw-r--r--llvm/lib/Target/AArch64/AArch64PointerAuth.cpp86
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA64FX.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td27
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp29
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp18
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp14
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp21
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp14
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h5
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td12
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp62
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td11
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp63
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp96
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td76
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td22
-rw-r--r--llvm/lib/Target/ARC/ARCISelLowering.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMFrameLowering.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp28
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp69
-rw-r--r--llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp6
-rw-r--r--llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.cpp5
-rw-r--r--llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp4
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.cpp4
-rw-r--r--llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp34
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.cpp4
-rw-r--r--llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp12
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp59
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchISelLowering.h1
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td50
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td4
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp13
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp7
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h8
-rw-r--r--llvm/lib/Target/M68k/M68kExpandPseudo.cpp10
-rw-r--r--llvm/lib/Target/M68k/M68kISelLowering.cpp243
-rw-r--r--llvm/lib/Target/M68k/M68kISelLowering.h9
-rw-r--r--llvm/lib/Target/M68k/M68kInstrArithmetic.td70
-rw-r--r--llvm/lib/Target/M68k/M68kInstrData.td3
-rw-r--r--llvm/lib/Target/M68k/M68kInstrFormats.td20
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.td30
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.cpp6
-rw-r--r--llvm/lib/Target/Mips/Mips64InstrInfo.td12
-rw-r--r--llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp20
-rw-r--r--llvm/lib/Target/Mips/MipsISelDAGToDAG.h1
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsInstrCompiler.td33
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.td13
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp8
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelLowering.cpp8
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp136
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp45
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp31
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp1
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h14
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp5
-rw-r--r--llvm/lib/Target/RISCV/RISCVCallingConv.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td53
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp379
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp14
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp11
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td11
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td152
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td183
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td91
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZc.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td34
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td72
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td59
-rw-r--r--llvm/lib/Target/RISCV/RISCVMacroFusion.cpp82
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td11
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.td7
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCVSystemOperands.td55
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp12
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp76
-rw-r--r--llvm/lib/Target/SystemZ/SystemZOperators.td4
-rw-r--r--llvm/lib/Target/TargetMachine.cpp4
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.cpp14
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp17
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp6
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h1
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h13
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp3
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp45
-rw-r--r--llvm/lib/Target/X86/X86.td11
-rw-r--r--llvm/lib/Target/X86/X86FastISel.cpp7
-rw-r--r--llvm/lib/Target/X86/X86FloatingPoint.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp35
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp40
-rw-r--r--llvm/lib/Target/X86/X86InsertPrefetch.cpp9
-rw-r--r--llvm/lib/Target/X86/X86InstrAMX.td86
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td1093
-rw-r--r--llvm/lib/Target/X86/X86InstrArithmetic.td1831
-rw-r--r--llvm/lib/Target/X86/X86InstrAsmAlias.td5
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td17
-rw-r--r--llvm/lib/Target/X86/X86InstrFPStack.td8
-rw-r--r--llvm/lib/Target/X86/X86InstrFormats.td9
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp8
-rw-r--r--llvm/lib/Target/X86/X86InstrKL.td22
-rw-r--r--llvm/lib/Target/X86/X86InstrMMX.td12
-rw-r--r--llvm/lib/Target/X86/X86InstrMisc.td211
-rw-r--r--llvm/lib/Target/X86/X86InstrPredicates.td3
-rw-r--r--llvm/lib/Target/X86/X86InstrRAOINT.td8
-rw-r--r--llvm/lib/Target/X86/X86InstrSGX.td6
-rw-r--r--llvm/lib/Target/X86/X86InstrSNP.td12
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td972
-rw-r--r--llvm/lib/Target/X86/X86InstrShiftRotate.td28
-rw-r--r--llvm/lib/Target/X86/X86InstrSystem.td143
-rw-r--r--llvm/lib/Target/X86/X86InstrTBM.td4
-rw-r--r--llvm/lib/Target/X86/X86InstrTDX.td14
-rw-r--r--llvm/lib/Target/X86/X86InstrTSX.td4
-rw-r--r--llvm/lib/Target/X86/X86InstrUtils.td677
-rw-r--r--llvm/lib/Target/X86/X86InstrVMX.td38
-rw-r--r--llvm/lib/Target/X86/X86InstrVecCompiler.td9
-rw-r--r--llvm/lib/Target/X86/X86InstrXOP.td28
-rw-r--r--llvm/lib/Target/X86/X86MacroFusion.cpp3
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp26
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h5
-rw-r--r--llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreISelLowering.cpp56
-rw-r--r--llvm/lib/TargetParser/ARMTargetParser.cpp6
-rw-r--r--llvm/lib/TargetParser/ARMTargetParserCommon.cpp6
-rw-r--r--llvm/lib/TargetParser/Host.cpp52
-rw-r--r--llvm/lib/TargetParser/Triple.cpp3
-rw-r--r--llvm/lib/TargetParser/X86TargetParser.cpp4
-rw-r--r--llvm/lib/TextAPI/CMakeLists.txt1
-rw-r--r--llvm/lib/TextAPI/TextStubV5.cpp7
-rw-r--r--llvm/lib/TextAPI/Utils.cpp40
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp6
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp53
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp46
-rw-r--r--llvm/lib/Transforms/IPO/OpenMPOpt.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfile.cpp13
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp63
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp45
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp196
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h22
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp7
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp71
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp113
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfiler.cpp2
-rw-r--r--llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp72
-rw-r--r--llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp60
-rw-r--r--llvm/lib/Transforms/Utils/DXILUpgrade.cpp12
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyIndVar.cpp7
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp63
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp148
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h30
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp65
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h1
-rw-r--r--llvm/runtimes/CMakeLists.txt2
-rw-r--r--llvm/test/Analysis/CostModel/X86/bswap-codesize.ll23
-rw-r--r--llvm/test/Analysis/CostModel/X86/bswap-latency.ll71
-rw-r--r--llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll23
-rw-r--r--llvm/test/Analysis/CostModel/X86/bswap.ll59
-rw-r--r--llvm/test/Analysis/CostModel/X86/cast.ll20
-rw-r--r--llvm/test/Analysis/ScalarEvolution/max-expr-cache.ll12
-rw-r--r--llvm/test/Analysis/ScalarEvolution/pr76234.ll23
-rw-r--r--llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll12
-rw-r--r--llvm/test/Analysis/ValueTracking/pr75505.ll17
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll92
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir246
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-addv.ll30
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll176
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-mulv.ll155
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-smull.ll18
-rw-r--r--llvm/test/CodeGen/AArch64/andcompare.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-addrmode.ll269
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-ccmp.ll67
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll602
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vabs.ll86
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vmax.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll58
-rw-r--r--llvm/test/CodeGen/AArch64/call-rv-marker.ll447
-rw-r--r--llvm/test/CodeGen/AArch64/fcmp.ll1861
-rw-r--r--llvm/test/CodeGen/AArch64/fptoi.ll16
-rw-r--r--llvm/test/CodeGen/AArch64/icmp.ll253
-rw-r--r--llvm/test/CodeGen/AArch64/itofp.ll21
-rwxr-xr-xllvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir47
-rw-r--r--llvm/test/CodeGen/AArch64/neon-compare-instructions.ll41
-rw-r--r--llvm/test/CodeGen/AArch64/reduce-and.ll309
-rw-r--r--llvm/test/CodeGen/AArch64/reduce-or.ll309
-rw-r--r--llvm/test/CodeGen/AArch64/reduce-xor.ll309
-rw-r--r--llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll36
-rw-r--r--llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll37
-rw-r--r--llvm/test/CodeGen/AArch64/sext.ll207
-rw-r--r--llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll542
-rw-r--r--llvm/test/CodeGen/AArch64/sign-return-address.ll3
-rw-r--r--llvm/test/CodeGen/AArch64/spillfill-sve.mir92
-rw-r--r--llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll67
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll306
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll274
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll98
-rw-r--r--llvm/test/CodeGen/AArch64/xtn.ll24
-rw-r--r--llvm/test/CodeGen/AArch64/zext.ll41
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir28
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll105
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll94
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll291
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll70
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll70
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll13060
-rw-r--r--llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll803
-rw-r--r--llvm/test/CodeGen/AMDGPU/cttz.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll53
-rw-r--r--llvm/test/CodeGen/AMDGPU/fadd.f16.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll75
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll137
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll131
-rw-r--r--llvm/test/CodeGen/AMDGPU/gds-unsupported.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll105
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll51
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll51
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll105
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll65
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir26
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll332
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rsq.f64.ll430
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv64.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-vectors.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/select.f16.ll1815
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir44
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem64.ll45
-rw-r--r--llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll43
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv64.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/urem64.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll2395
-rw-r--r--llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir10
-rw-r--r--llvm/test/CodeGen/BPF/loop-exit-cond.ll2
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/build-vector.ll4
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll108
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/vselect.ll6
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/vselect.ll6
-rw-r--r--llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll119
-rw-r--r--llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll69
-rw-r--r--llvm/test/CodeGen/M68k/global-address.ll40
-rw-r--r--llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll2
-rw-r--r--llvm/test/CodeGen/Mips/funnel-shift-rot.ll120
-rw-r--r--llvm/test/CodeGen/Mips/funnel-shift.ll103
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/ashr.ll52
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/lshr.ll4
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/shl.ll58
-rw-r--r--llvm/test/CodeGen/Mips/optimizeAndPlusShift.ll84
-rw-r--r--llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll2
-rw-r--r--llvm/test/CodeGen/NVPTX/nvvm-reflect.ll2
-rw-r--r--llvm/test/CodeGen/PowerPC/aix-return55.ll2
-rw-r--r--llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll8
-rw-r--r--llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll6
-rw-r--r--llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir28
-rw-r--r--llvm/test/CodeGen/PowerPC/f128-arith.ll54
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv32.mir2
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir2
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv32.mir36
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir84
-rw-r--r--llvm/test/CodeGen/RISCV/attributes.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/bfloat-convert.ll42
-rw-r--r--llvm/test/CodeGen/RISCV/bfloat.ll32
-rw-r--r--llvm/test/CodeGen/RISCV/bittest.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/branch-on-zero.ll41
-rw-r--r--llvm/test/CodeGen/RISCV/compress-inline-asm.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/div_minsize.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/double-select-icmp.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/float-imm.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/float-select-verify.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/fmax-fmin.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/half-select-icmp.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/init-array.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/inline-asm-abi-names.ll32
-rw-r--r--llvm/test/CodeGen/RISCV/macro-fusions.mir (renamed from llvm/test/CodeGen/RISCV/macro-fusions-veyron-v1.mir)29
-rw-r--r--llvm/test/CodeGen/RISCV/neg-abs.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/pr63816.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/reduction-formation.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rv32xtheadba.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rv32xtheadbs.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-patchpoint.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rv64xtheadba.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rv64xtheadbs.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/binop-splats.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll128
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll184
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll43
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll185
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll132
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll146
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll132
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll26
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll26
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll70
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll70
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll70
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll70
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sink-splat-operands-i1.ll5
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll1108
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll379
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vaesdf.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vaesdm.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vaesef.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vaesem.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vaesz.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vector-splice.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll107
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll610
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll5
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll626
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsm4r.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/saverestore-scs.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/split-urem-by-constant.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/stack-folding.ll120
-rw-r--r--llvm/test/CodeGen/RISCV/switch-width.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/unroll-loop-cse.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll281
-rw-r--r--llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll316
-rw-r--r--llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll553
-rw-r--r--llvm/test/CodeGen/RISCV/vp-splice.ll330
-rw-r--r--llvm/test/CodeGen/RISCV/xaluo.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll1
-rw-r--r--llvm/test/CodeGen/SystemZ/atomic-load-05.ll18
-rw-r--r--llvm/test/CodeGen/SystemZ/atomic-load-08.ll26
-rw-r--r--llvm/test/CodeGen/SystemZ/atomic-store-05.ll9
-rw-r--r--llvm/test/CodeGen/SystemZ/atomic-store-06.ll14
-rw-r--r--llvm/test/CodeGen/SystemZ/atomic-store-07.ll7
-rw-r--r--llvm/test/CodeGen/SystemZ/atomic-store-08.ll26
-rw-r--r--llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll31
-rw-r--r--llvm/test/CodeGen/X86/bfloat.ll102
-rw-r--r--llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll185
-rw-r--r--llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir4
-rw-r--r--llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir47
-rw-r--r--llvm/test/CodeGen/X86/code-model-elf-sections.ll16
-rw-r--r--llvm/test/CodeGen/X86/code-model-elf.ll66
-rw-r--r--llvm/test/CodeGen/X86/fast-isel-large-object.ll5
-rw-r--r--llvm/test/CodeGen/X86/fold-add.ll3
-rw-r--r--llvm/test/CodeGen/X86/insert.ll93
-rw-r--r--llvm/test/CodeGen/X86/shift-i128.ll4
-rw-r--r--llvm/test/CodeGen/X86/shift-i256.ll1
-rw-r--r--llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll2
-rw-r--r--llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir348
-rw-r--r--llvm/test/CodeGen/X86/usermsr-intrinsics.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll21
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll3
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll6
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll21
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll4
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll3
-rw-r--r--llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll18
-rw-r--r--llvm/test/DebugInfo/X86/array2.ll2
-rw-r--r--llvm/test/DebugInfo/X86/sret.ll4
-rw-r--r--llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml307
-rw-r--r--llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml160
-rw-r--r--llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll2
-rw-r--r--llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll2
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-e3dse.s13
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-fgwte3.s6
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-hacdbs.s12
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-hdbss.s12
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s57
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s12
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-pauthlr.s178
-rw-r--r--llvm/test/MC/AArch64/armv9.5a-tlbiw.s27
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_unsupported.s3
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_features.s29
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vflat.s24
-rw-r--r--llvm/test/MC/AMDGPU/hsa-diag-v4.s26
-rw-r--r--llvm/test/MC/AMDGPU/hsa-gfx12-v4.s2
-rw-r--r--llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt13
-rw-r--r--llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt7
-rw-r--r--llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt14
-rw-r--r--llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt14
-rw-r--r--llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt94
-rw-r--r--llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt27
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/decode-err.txt8
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt12
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt5
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/adc.txt210
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/add.txt418
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/amx-tile.txt22
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/and.txt418
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt122
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/crc32.txt90
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/evex-format.txt104
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/neg.txt130
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/not.txt66
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/or.txt418
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt400
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/sbb.txt210
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/sub.txt418
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrssd.txt6
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrssq.txt6
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrussd.txt6
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrussq.txt6
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/xor.txt418
-rw-r--r--llvm/test/MC/LoongArch/Macros/macros-la.s84
-rw-r--r--llvm/test/MC/LoongArch/Misc/subsection.s2
-rw-r--r--llvm/test/MC/LoongArch/Relocations/relax-addsub.s16
-rw-r--r--llvm/test/MC/RISCV/attribute-arch.s6
-rw-r--r--llvm/test/MC/RISCV/compressed-zicfiss.s53
-rw-r--r--llvm/test/MC/RISCV/fixups-expr.s34
-rw-r--r--llvm/test/MC/RISCV/machine-csr-names.s7
-rw-r--r--llvm/test/MC/RISCV/rv32zcmop-invalid.s7
-rw-r--r--llvm/test/MC/RISCV/rv32zicfiss-invalid.s17
-rw-r--r--llvm/test/MC/RISCV/rv32zimop-invalid.s6
-rw-r--r--llvm/test/MC/RISCV/rv64zicfiss-invalid.s17
-rw-r--r--llvm/test/MC/RISCV/rvzcmop-valid.s42
-rw-r--r--llvm/test/MC/RISCV/rvzimop-valid.s26
-rw-r--r--llvm/test/MC/RISCV/xsfcie-invalid.s39
-rw-r--r--llvm/test/MC/RISCV/xsfcie-valid.s136
-rw-r--r--llvm/test/MC/RISCV/zicfiss-valid.s102
-rw-r--r--llvm/test/MC/WebAssembly/custom-sections.ll6
-rw-r--r--llvm/test/MC/WebAssembly/debug-info.ll44
-rw-r--r--llvm/test/MC/WebAssembly/debug-info64.ll46
-rw-r--r--llvm/test/MC/WebAssembly/tag-section.ll2
-rw-r--r--llvm/test/MC/X86/apx/adc-att.s161
-rw-r--r--llvm/test/MC/X86/apx/adc-intel.s158
-rw-r--r--llvm/test/MC/X86/apx/add-att.s317
-rw-r--r--llvm/test/MC/X86/apx/add-intel.s314
-rw-r--r--llvm/test/MC/X86/apx/amx-tile-att.s24
-rw-r--r--llvm/test/MC/X86/apx/amx-tile-intel.s21
-rw-r--r--llvm/test/MC/X86/apx/and-att.s317
-rw-r--r--llvm/test/MC/X86/apx/and-intel.s314
-rw-r--r--llvm/test/MC/X86/apx/cmpccxadd-att.s124
-rw-r--r--llvm/test/MC/X86/apx/cmpccxadd-intel.s121
-rw-r--r--llvm/test/MC/X86/apx/crc32-att.s92
-rw-r--r--llvm/test/MC/X86/apx/crc32-intel.s89
-rw-r--r--llvm/test/MC/X86/apx/evex-format-att.s105
-rw-r--r--llvm/test/MC/X86/apx/evex-format-intel.s105
-rw-r--r--llvm/test/MC/X86/apx/neg-att.s101
-rw-r--r--llvm/test/MC/X86/apx/neg-intel.s98
-rw-r--r--llvm/test/MC/X86/apx/not-att.s53
-rw-r--r--llvm/test/MC/X86/apx/not-intel.s50
-rw-r--r--llvm/test/MC/X86/apx/or-att.s317
-rw-r--r--llvm/test/MC/X86/apx/or-intel.s314
-rw-r--r--llvm/test/MC/X86/apx/sbb-att.s161
-rw-r--r--llvm/test/MC/X86/apx/sbb-intel.s158
-rw-r--r--llvm/test/MC/X86/apx/sub-att.s317
-rw-r--r--llvm/test/MC/X86/apx/sub-intel.s314
-rw-r--r--llvm/test/MC/X86/apx/wrssd-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrssd-intel.s5
-rw-r--r--llvm/test/MC/X86/apx/wrssq-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrssq-intel.s5
-rw-r--r--llvm/test/MC/X86/apx/wrussd-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrussd-intel.s5
-rw-r--r--llvm/test/MC/X86/apx/wrussq-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrussq-intel.s5
-rw-r--r--llvm/test/MC/X86/apx/xor-att.s317
-rw-r--r--llvm/test/MC/X86/apx/xor-intel.s314
-rw-r--r--llvm/test/MC/X86/register-assignment-error.s8
-rw-r--r--llvm/test/MC/X86/register-assignment.s10
-rw-r--r--llvm/test/MC/X86/x86_64-asm-match.s2
-rw-r--r--llvm/test/TableGen/x86-fold-tables.inc276
-rw-r--r--llvm/test/ThinLTO/X86/devirt.ll2
-rw-r--r--llvm/test/ThinLTO/X86/devirt2.ll2
-rw-r--r--llvm/test/ThinLTO/X86/devirt_check.ll2
-rw-r--r--llvm/test/ThinLTO/X86/devirt_promote.ll2
-rw-r--r--llvm/test/ThinLTO/X86/devirt_promote_legacy.ll2
-rw-r--r--llvm/test/ThinLTO/X86/devirt_pure_virtual_base.ll2
-rw-r--r--llvm/test/ThinLTO/X86/devirt_single_hybrid.ll4
-rw-r--r--llvm/test/ThinLTO/X86/devirt_vcall_vis_hidden.ll2
-rw-r--r--llvm/test/ThinLTO/X86/devirt_vcall_vis_public.ll4
-rw-r--r--llvm/test/ThinLTO/X86/funcimport.ll2
-rw-r--r--llvm/test/ThinLTO/X86/globals-import-const-fold.ll2
-rw-r--r--llvm/test/ThinLTO/X86/import-constant.ll2
-rw-r--r--llvm/test/ThinLTO/X86/index-const-prop-alias.ll2
-rw-r--r--llvm/test/ThinLTO/X86/index-const-prop.ll2
-rw-r--r--llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll20
-rw-r--r--llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll20
-rw-r--r--llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll18
-rw-r--r--llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll36
-rw-r--r--llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll8
-rw-r--r--llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll20
-rw-r--r--llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll38
-rw-r--r--llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll206
-rw-r--r--llvm/test/Transforms/CanonicalizeFreezeInLoops/duplicate_remove.ll32
-rw-r--r--llvm/test/Transforms/ConstraintElimination/abs.ll118
-rw-r--r--llvm/test/Transforms/ConstraintElimination/add-nuw.ll26
-rw-r--r--llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll24
-rw-r--r--llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll40
-rw-r--r--llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll6
-rw-r--r--llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll3
-rw-r--r--llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll3
-rw-r--r--llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll3
-rw-r--r--llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll6
-rw-r--r--llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll3
-rw-r--r--llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll10
-rw-r--r--llvm/test/Transforms/ConstraintElimination/max-row-limit.ll4
-rw-r--r--llvm/test/Transforms/ConstraintElimination/mul.ll18
-rw-r--r--llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll3
-rw-r--r--llvm/test/Transforms/ConstraintElimination/or.ll19
-rw-r--r--llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll36
-rw-r--r--llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll1
-rw-r--r--llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll4
-rw-r--r--llvm/test/Transforms/ConstraintElimination/shl.ll33
-rw-r--r--llvm/test/Transforms/ConstraintElimination/sub-nuw.ll5
-rw-r--r--llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll64
-rw-r--r--llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll29
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/basic.ll14
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll29
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll8
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll42
-rw-r--r--llvm/test/Transforms/FunctionAttrs/argmemonly.ll6
-rw-r--r--llvm/test/Transforms/FunctionAttrs/convergent.ll2
-rw-r--r--llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll4
-rw-r--r--llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll2
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nocapture.ll26
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll4
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nonnull.ll37
-rw-r--r--llvm/test/Transforms/FunctionAttrs/norecurse.ll17
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nosync.ll6
-rw-r--r--llvm/test/Transforms/FunctionAttrs/noundef.ll154
-rw-r--r--llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll4
-rw-r--r--llvm/test/Transforms/FunctionAttrs/readattrs.ll4
-rw-r--r--llvm/test/Transforms/FunctionAttrs/writeonly.ll18
-rw-r--r--llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll8
-rw-r--r--llvm/test/Transforms/Inline/devirtualize-3.ll2
-rw-r--r--llvm/test/Transforms/Inline/devirtualize-5.ll2
-rw-r--r--llvm/test/Transforms/Inline/launder.invariant.group.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/and-or-icmps.ll6
-rw-r--r--llvm/test/Transforms/InstCombine/array.ll36
-rw-r--r--llvm/test/Transforms/InstCombine/call-cast-attrs.ll7
-rw-r--r--llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll645
-rw-r--r--llvm/test/Transforms/InstCombine/free-inversion.ll18
-rw-r--r--llvm/test/Transforms/InstCombine/getelementptr.ll162
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-gep.ll24
-rw-r--r--llvm/test/Transforms/InstCombine/icmp.ll274
-rw-r--r--llvm/test/Transforms/InstCombine/minmax-intrinsics.ll92
-rw-r--r--llvm/test/Transforms/InstCombine/mul-inseltpoison.ll5
-rw-r--r--llvm/test/Transforms/InstCombine/mul.ll173
-rw-r--r--llvm/test/Transforms/InstCombine/or.ll153
-rw-r--r--llvm/test/Transforms/InstCombine/select-and-or.ll396
-rw-r--r--llvm/test/Transforms/InstCombine/select-factorize.ll10
-rw-r--r--llvm/test/Transforms/InstCombine/select.ll24
-rw-r--r--llvm/test/Transforms/InstCombine/xor-icmps.ll148
-rw-r--r--llvm/test/Transforms/InstCombine/zext-or-icmp.ll8
-rw-r--r--llvm/test/Transforms/InstSimplify/div.ll97
-rw-r--r--llvm/test/Transforms/InstSimplify/select-inseltpoison.ll8
-rw-r--r--llvm/test/Transforms/InstSimplify/select.ll8
-rw-r--r--llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll12
-rw-r--r--llvm/test/Transforms/LoopIdiom/ARM/ctlz.ll25
-rw-r--r--llvm/test/Transforms/LoopIdiom/X86/ctlz.ll47
-rw-r--r--llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-cost-compare.ll36
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll1846
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll554
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll74
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll16
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll90
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll1347
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll1290
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll3
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll18
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll9
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll36
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll2641
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll1547
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll70
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll216
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll18
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll16
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll18
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/interleaving.ll60
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll24
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll66
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll336
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr23997.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr35432.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr47437.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll26
-rw-r--r--llvm/test/Transforms/LoopVectorize/float-induction.ll16
-rw-r--r--llvm/test/Transforms/LoopVectorize/induction.ll78
-rw-r--r--llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll64
-rw-r--r--llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll12
-rw-r--r--llvm/test/Transforms/LoopVectorize/scalable-inductions.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll3
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-printing.ll63
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll6
-rw-r--r--llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll65
-rw-r--r--llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll3
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll22
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll10
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll10
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll6
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll4
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll4
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll12
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll2
-rw-r--r--llvm/test/Transforms/PhaseOrdering/gep-null-compare-in-loop.ll4
-rw-r--r--llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll61
-rw-r--r--llvm/test/Transforms/Reassociate/basictest.ll4
-rw-r--r--llvm/test/Transforms/SCCP/pr50901.ll9
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll24
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll34
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll18
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/supernode.ll22
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll20
-rw-r--r--llvm/test/Transforms/SROA/pr64081.ll32
-rw-r--r--llvm/test/Transforms/SampleProfile/ctxsplit.ll12
-rw-r--r--llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll55
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected4
-rw-r--r--llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll2
-rw-r--r--llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll2
-rw-r--r--llvm/test/tools/gold/X86/opt-level.ll2
-rw-r--r--llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll2
-rw-r--r--llvm/test/tools/llvm-cxxfilt/no-params.test34
-rw-r--r--llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm29
-rw-r--r--llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s39
-rw-r--r--llvm/test/tools/llvm-nm/wasm/dylink.yaml69
-rw-r--r--llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s105
-rw-r--r--llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll2
-rw-r--r--llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll65
-rw-r--r--llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml70
-rw-r--r--llvm/test/tools/llvm-profdata/raw-32-bits-be.test6
-rw-r--r--llvm/test/tools/llvm-profdata/raw-32-bits-le.test6
-rw-r--r--llvm/test/tools/llvm-profdata/raw-64-bits-be.test12
-rw-r--r--llvm/test/tools/llvm-profdata/raw-64-bits-le.test12
-rw-r--r--llvm/test/tools/llvm-readobj/wasm/globals.test2
-rw-r--r--llvm/test/tools/llvm-readobj/wasm/sections.test14
-rw-r--r--llvm/test/tools/llvm-readtapi/Inputs/flat_namespace.yaml328
-rw-r--r--llvm/test/tools/llvm-readtapi/Inputs/mixed-swift-objc.yaml4011
-rw-r--r--llvm/test/tools/llvm-readtapi/Inputs/objc.yaml692
-rw-r--r--llvm/test/tools/llvm-readtapi/Inputs/thread_local.yaml228
-rw-r--r--llvm/test/tools/llvm-readtapi/Inputs/universal.yaml372
-rw-r--r--llvm/test/tools/llvm-readtapi/compare-incorrect-format.test7
-rw-r--r--llvm/test/tools/llvm-readtapi/compare-tbd-dylib.test41
-rw-r--r--llvm/test/tools/llvm-readtapi/stubify-invalid.test8
-rw-r--r--llvm/test/tools/llvm-readtapi/stubify.test158
-rw-r--r--llvm/test/tools/llvm-readtapi/write.test103
-rw-r--r--llvm/tools/llvm-ar/llvm-ar.cpp3
-rw-r--r--llvm/tools/llvm-cxxfilt/Opts.td2
-rw-r--r--llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp10
-rw-r--r--llvm/tools/llvm-diff/llvm-diff.cpp3
-rw-r--r--llvm/tools/llvm-dwarfdump/fuzzer/llvm-dwarfdump-fuzzer.cpp4
-rw-r--r--llvm/tools/llvm-exegesis/lib/Assembler.cpp2
-rw-r--r--llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp48
-rw-r--r--llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h2
-rw-r--r--llvm/tools/llvm-exegesis/lib/X86/Target.cpp127
-rw-r--r--llvm/tools/llvm-exegesis/llvm-exegesis.cpp6
-rw-r--r--llvm/tools/llvm-objdump/XCOFFDump.cpp1
-rw-r--r--llvm/tools/llvm-objdump/XCOFFDump.h1
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp171
-rw-r--r--llvm/tools/llvm-profdata/llvm-profdata.cpp29
-rw-r--r--llvm/tools/llvm-profgen/CSPreInliner.cpp5
-rw-r--r--llvm/tools/llvm-readtapi/CMakeLists.txt1
-rw-r--r--llvm/tools/llvm-readtapi/TapiOpts.td6
-rw-r--r--llvm/tools/llvm-readtapi/llvm-readtapi.cpp78
-rw-r--r--llvm/tools/llvm-special-case-list-fuzzer/special-case-list-fuzzer.cpp5
-rw-r--r--llvm/unittests/Analysis/TargetLibraryInfoTest.cpp35
-rw-r--r--llvm/unittests/Analysis/ValueTrackingTest.cpp8
-rw-r--r--llvm/unittests/CodeGen/InstrRefLDVTest.cpp45
-rw-r--r--llvm/unittests/Support/RISCVISAInfoTest.cpp4
-rw-r--r--llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp78
-rw-r--r--llvm/unittests/TargetParser/TargetParserTest.cpp6
-rw-r--r--llvm/utils/TableGen/AsmMatcherEmitter.cpp6
-rw-r--r--llvm/utils/TableGen/X86DisassemblerTables.cpp21
-rw-r--r--llvm/utils/TableGen/X86FoldTablesEmitter.cpp18
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.cpp37
-rw-r--r--llvm/utils/TableGen/X86RecognizableInstr.h4
-rwxr-xr-xllvm/utils/chunk-print-before-all.py4
-rwxr-xr-xllvm/utils/git/code-format-helper.py17
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn12
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn5
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn8
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn10
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn2
-rw-r--r--llvm/utils/lit/lit/cl_arguments.py2
-rw-r--r--mlir/include/mlir-c/Dialect/SPIRV.h26
-rw-r--r--mlir/include/mlir/Analysis/DataFlowFramework.h4
-rw-r--r--mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h5
-rw-r--r--mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h71
-rw-r--r--mlir/include/mlir/Dialect/GPU/IR/GPUOps.td18
-rw-r--r--mlir/include/mlir/Dialect/GPU/Transforms/Passes.h8
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td8
-rw-r--r--mlir/include/mlir/Dialect/Math/IR/MathOps.td114
-rw-r--r--mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td286
-rw-r--r--mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td2
-rw-r--r--mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td2
-rw-r--r--mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td1
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td7
-rw-r--r--mlir/include/mlir/Dialect/Tensor/IR/Tensor.h3
-rw-r--r--mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h5
-rw-r--r--mlir/include/mlir/Dialect/UB/IR/UBOps.td6
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/VectorOps.td42
-rw-r--r--mlir/include/mlir/IR/Value.h2
-rw-r--r--mlir/include/mlir/Transforms/FoldUtils.h8
-rw-r--r--mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp2
-rw-r--r--mlir/lib/Analysis/Presburger/CMakeLists.txt1
-rw-r--r--mlir/lib/Analysis/Presburger/GeneratingFunction.h134
-rw-r--r--mlir/lib/Analysis/Presburger/Matrix.cpp4
-rw-r--r--mlir/lib/Analysis/Presburger/PresburgerSpace.cpp39
-rw-r--r--mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp115
-rw-r--r--mlir/lib/CAPI/Dialect/CMakeLists.txt9
-rw-r--r--mlir/lib/CAPI/Dialect/SPIRV.cpp13
-rw-r--r--mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp227
-rw-r--r--mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp12
-rw-r--r--mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp3
-rw-r--r--mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp11
-rw-r--r--mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp52
-rw-r--r--mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp30
-rw-r--r--mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp1
-rw-r--r--mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp5
-rw-r--r--mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp34
-rw-r--r--mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp11
-rw-r--r--mlir/lib/Dialect/EmitC/IR/EmitC.cpp58
-rw-r--r--mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp5
-rw-r--r--mlir/lib/Dialect/GPU/CMakeLists.txt5
-rw-r--r--mlir/lib/Dialect/GPU/IR/GPUDialect.cpp11
-rw-r--r--mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt10
-rw-r--r--mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp150
-rw-r--r--mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp1
-rw-r--r--mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp26
-rw-r--r--mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp2
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp5
-rw-r--r--mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp172
-rw-r--r--mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp515
-rw-r--r--mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp55
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp14
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp118
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h29
-rw-r--r--mlir/lib/Dialect/Tensor/IR/TensorOps.cpp38
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt2
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp (renamed from mlir/lib/Dialect/Tensor/Transforms/FoldIntoPackAndUnpackPatterns.cpp)44
-rw-r--r--mlir/lib/ExecutionEngine/AsyncRuntime.cpp2
-rw-r--r--mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp6
-rw-r--r--mlir/lib/Target/LLVMIR/ModuleImport.cpp2
-rw-r--r--mlir/lib/Transforms/SCCP.cpp2
-rw-r--r--mlir/lib/Transforms/Utils/FoldUtils.cpp33
-rw-r--r--mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp45
-rw-r--r--mlir/python/CMakeLists.txt18
-rw-r--r--mlir/python/mlir/dialects/SPIRVOps.td14
-rw-r--r--mlir/python/mlir/dialects/arith.py8
-rw-r--r--mlir/python/mlir/dialects/builtin.py23
-rw-r--r--mlir/python/mlir/dialects/func.py3
-rw-r--r--mlir/python/mlir/dialects/pdl.py10
-rw-r--r--mlir/python/mlir/dialects/scf.py2
-rw-r--r--mlir/python/mlir/dialects/spirv.py5
-rw-r--r--mlir/python/mlir/dialects/tensor.py7
-rw-r--r--mlir/python/mlir/dialects/transform/__init__.py14
-rw-r--r--mlir/python/mlir/dialects/transform/extras/__init__.py (renamed from mlir/python/mlir/extras/dialects/transform/__init__.py)56
-rw-r--r--mlir/python/mlir/extras/meta.py83
-rw-r--r--mlir/test/Conversion/GPUToSPIRV/reductions.mlir38
-rw-r--r--mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir2
-rw-r--r--mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir75
-rw-r--r--mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir10
-rw-r--r--mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir85
-rw-r--r--mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir30
-rw-r--r--mlir/test/Dialect/Affine/loop-fusion.mlir32
-rw-r--r--mlir/test/Dialect/Arith/expand-ops.mlir42
-rw-r--r--mlir/test/Dialect/EmitC/invalid_ops.mlir22
-rw-r--r--mlir/test/Dialect/GPU/invalid.mlir14
-rw-r--r--mlir/test/Dialect/GPU/ops.mlir5
-rw-r--r--mlir/test/Dialect/GPU/subgroup-redule-lowering.mlir71
-rw-r--r--mlir/test/Dialect/LLVMIR/func.mlir6
-rw-r--r--mlir/test/Dialect/LLVMIR/inlining.mlir47
-rw-r--r--mlir/test/Dialect/Linalg/one-shot-bufferize-empty-tensor-elimination.mlir86
-rw-r--r--mlir/test/Dialect/OpenACC/invalid.mlir4
-rw-r--r--mlir/test/Dialect/OpenACC/ops.mlir76
-rw-r--r--mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir128
-rw-r--r--mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir58
-rw-r--r--mlir/test/Dialect/Tensor/simplify-tensor-pack.mlir24
-rw-r--r--mlir/test/Dialect/Transform/test-pattern-application.mlir2
-rw-r--r--mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir38
-rw-r--r--mlir/test/Dialect/Vector/ops.mlir10
-rw-r--r--mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matmul-transforms.mlir (renamed from mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir)233
-rw-r--r--mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir60
-rw-r--r--mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir8
-rw-r--r--mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir67
-rw-r--r--mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir74
-rw-r--r--mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir89
-rw-r--r--mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir3
-rw-r--r--mlir/test/Target/Cpp/const.mlir6
-rw-r--r--mlir/test/Target/LLVMIR/Import/calling-convention.ll32
-rw-r--r--mlir/test/Target/LLVMIR/Import/intrinsic.ll8
-rw-r--r--mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir8
-rw-r--r--mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir36
-rw-r--r--mlir/test/Target/SPIRV/physical-storage-buffer.mlir48
-rw-r--r--mlir/test/Transforms/canonicalize-debuginfo.mlir46
-rw-r--r--mlir/test/Transforms/constant-fold-debuginfo.mlir42
-rw-r--r--mlir/test/lib/Analysis/TestDataFlowFramework.cpp2
-rw-r--r--mlir/test/lib/Dialect/GPU/CMakeLists.txt1
-rw-r--r--mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp27
-rw-r--r--mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp14
-rw-r--r--mlir/test/lib/Dialect/Test/TestOps.td6
-rw-r--r--mlir/test/lib/Transforms/TestIntRangeInference.cpp5
-rw-r--r--mlir/test/mlir-tblgen/gen-dialect-doc.td6
-rw-r--r--mlir/test/mlir-vulkan-runner/addf_if.mlir54
-rw-r--r--mlir/test/python/dialects/arith_dialect.py6
-rw-r--r--mlir/test/python/dialects/spirv_dialect.py21
-rw-r--r--mlir/test/python/dialects/tensor.py35
-rw-r--r--mlir/test/python/dialects/transform_extras.py73
-rw-r--r--mlir/test/python/integration/dialects/transform.py155
-rw-r--r--mlir/tools/mlir-opt/mlir-opt.cpp4
-rw-r--r--mlir/tools/mlir-tblgen/OpDocGen.cpp12
-rw-r--r--mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp5
-rw-r--r--mlir/unittests/Analysis/Presburger/CMakeLists.txt1
-rw-r--r--mlir/unittests/Analysis/Presburger/MatrixTest.cpp3
-rw-r--r--mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp84
-rw-r--r--mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp140
-rw-r--r--mlir/unittests/Analysis/Presburger/Utils.h23
-rw-r--r--openmp/libomptarget/CMakeLists.txt8
-rw-r--r--openmp/libomptarget/include/Shared/Profile.h12
-rw-r--r--openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h2
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h9
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/include/JIT.h2
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h2
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp41
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/src/JIT.cpp24
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp52
-rw-r--r--openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp3
-rw-r--r--openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp10
-rw-r--r--openmp/libomptarget/src/OpenMP/API.cpp17
-rw-r--r--openmp/libomptarget/src/interface.cpp29
-rw-r--r--openmp/libomptarget/src/omptarget.cpp21
-rw-r--r--openmp/libomptarget/test/lit.cfg14
-rw-r--r--openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/basic_target_region.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/constant-arr-index.f903
-rw-r--r--openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f903
-rw-r--r--openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/target_map_common_block.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f902
-rw-r--r--openmp/libomptarget/test/offloading/fortran/target_update.f9050
-rw-r--r--openmp/libomptarget/test/offloading/malloc.c8
-rw-r--r--openmp/libomptarget/test/offloading/malloc_parallel.c8
-rw-r--r--openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp1
-rw-r--r--openmp/libomptarget/unittests/CMakeLists.txt8
-rw-r--r--openmp/libomptarget/unittests/Plugins/CMakeLists.txt11
-rw-r--r--openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp168
-rw-r--r--polly/cmake/polly_macros.cmake3
-rw-r--r--polly/lib/External/CMakeLists.txt1
-rw-r--r--polly/lib/Transform/ScheduleOptimizer.cpp18
-rw-r--r--polly/test/ScheduleOptimizer/schedule_computeout.ll94
-rw-r--r--third-party/benchmark/src/sysinfo.cc2
-rw-r--r--utils/bazel/llvm-project-overlay/clang/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h6
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel18
-rw-r--r--utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl1
-rw-r--r--utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel2
-rw-r--r--utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl1
-rw-r--r--utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel2
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel33
-rw-r--r--utils/bazel/llvm_configs/config.h.cmake6
-rw-r--r--utils/bazel/llvm_configs/llvm-config.h.cmake6
1740 files changed, 100048 insertions, 30114 deletions
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index f6b380c7f54a..988302e6a57a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -82,9 +82,12 @@
/mlir/**/*EmulateNarrowType* @hanhanW
/mlir/lib/Dialect/Vector/Transforms/* @hanhanW @nicolasvasilache
+# Presburger library in MLIR
+/mlir/**/*Presburger* @Groverkss @Superty
+
# Tensor Dialect in MLIR.
/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @hanhanW @nicolasvasilache
-/mlir/lib/Dialect/Tensor/Transforms/FoldIntoPackAndUnpackPatterns.cpp @hanhanW @nicolasvasilache
+/mlir/lib/Dialect/Tensor/Transforms/* @hanhanW @nicolasvasilache
# Transform Dialect in MLIR.
/mlir/include/mlir/Dialect/Transform/* @ftynse @nicolasvasilache
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 370cf830a60c..25e8c8c1ef21 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -185,7 +185,7 @@ jobs:
std_modules: 'OFF'
# Use a larger machine for MSAN to avoid timeout and memory allocation issues.
- config: 'generic-msan'
- machine: libcxx-runners-32-set
+ machine: libcxx-runners-8-set
std_modules: 'OFF'
runs-on: ${{ matrix.machine }}
steps:
diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp
index caa5ecbea521..762d3419edd3 100644
--- a/bolt/lib/Core/DIEBuilder.cpp
+++ b/bolt/lib/Core/DIEBuilder.cpp
@@ -266,13 +266,11 @@ void DIEBuilder::buildCompileUnits(const bool Init) {
}
void DIEBuilder::buildCompileUnits(const std::vector<DWARFUnit *> &CUs) {
BuilderState.reset(new State());
- // Initializing to full size because there could be cross CU references with
- // different abbrev offsets. LLVM happens to output CUs that have cross CU
- // references with the same abbrev table. So destinations end up in the first
- // set, even if they themselves don't have src cross cu ref. We could have
- // cases where this is not the case. In which case this container needs to be
- // big enough for all.
- getState().CloneUnitCtxMap.resize(DwarfContext->getNumCompileUnits());
+  // Allocating enough for the current batch being processed.
+  // In real use cases we are either processing a batch of CUs with no cross
+ // references, or if they do have them it is due to LTO. With clang they will
+ // share the same abbrev table. In either case this vector will not grow.
+ getState().CloneUnitCtxMap.resize(CUs.size());
getState().Type = ProcessingType::CUs;
for (DWARFUnit *CU : CUs)
registerUnit(*CU, false);
@@ -897,6 +895,10 @@ void DIEBuilder::registerUnit(DWARFUnit &DU, bool NeedSort) {
});
}
getState().UnitIDMap[getHash(DU)] = getState().DUList.size();
+  // This handles the case where we do have cross-CU references, but CUs do not
+ // share the same abbrev table.
+ if (getState().DUList.size() == getState().CloneUnitCtxMap.size())
+ getState().CloneUnitCtxMap.emplace_back();
getState().DUList.push_back(&DU);
}
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
index d9f5da3e3bca..5de075973004 100644
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -175,8 +175,12 @@ struct SplitCacheDirected final : public SplitStrategy {
void fragment(const BlockIt Start, const BlockIt End) override {
BasicBlockOrder BlockOrder(Start, End);
BinaryFunction &BF = *BlockOrder.front()->getFunction();
+ // No need to re-split small functions.
+ if (BlockOrder.size() <= 2)
+ return;
size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
+ assert(BestSplitIndex < BlockOrder.size());
// Assign fragments based on the computed best split index.
// All basic blocks with index up to the best split index become hot.
@@ -200,10 +204,12 @@ private:
};
struct SplitScore {
- size_t SplitIndex;
+ size_t SplitIndex = size_t(-1);
size_t HotSizeReduction = 0;
double LocalScore = 0;
double CoverCallScore = 0;
+
+ double sum() const { return LocalScore + CoverCallScore; }
};
// Auxiliary variables used by the algorithm.
@@ -303,7 +309,7 @@ private:
const size_t SplitIndex) {
assert(SplitIndex < BlockOrder.size() && "Invalid split index");
- // Update function layout assuming hot-warm splitting at SplitIndex
+ // Update function layout assuming hot-warm splitting at SplitIndex.
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
BinaryBasicBlock *BB = BlockOrder[Index];
if (BB->getFragmentNum() == FragmentNum::cold())
@@ -319,8 +325,8 @@ private:
// Populate BB.OutputAddressRange with estimated new start and end addresses
// and compute the old end address of the hot section and the new end
// address of the hot section.
- size_t OldHotEndAddr;
- size_t NewHotEndAddr;
+ size_t OldHotEndAddr{0};
+ size_t NewHotEndAddr{0};
size_t CurrentAddr = BBOffsets[BlockOrder[0]];
for (BinaryBasicBlock *BB : BlockOrder) {
// We only care about new addresses of blocks in hot/warm.
@@ -492,20 +498,15 @@ private:
}
/// Compute the split score of splitting a function at a given index.
- /// The split score consists of local score and cover score. Cover call score
- /// is expensive to compute. As a result, we pass in a \p ReferenceScore and
- /// compute cover score only when the local score exceeds that in the
- /// ReferenceScore or that the size reduction of the hot fragment is larger
- /// than that achieved by the split index of the ReferenceScore. This function
- /// returns \p Score of SplitScore type. It contains the local score and cover
- /// score (if computed) of the current splitting index. For easier book
- /// keeping and comparison, it also stores the split index and the resulting
- /// reduction in hot fragment size.
+ /// The split score consists of local score and cover score. This function
+ /// returns \p Score of SplitScore type. It contains the local score and
+  /// cover score of the current splitting index. For easier bookkeeping and
+ /// comparison, it also stores the split index and the resulting reduction
+ /// in hot fragment size.
SplitScore computeSplitScore(const BinaryFunction &BF,
const BasicBlockOrder &BlockOrder,
const size_t SplitIndex,
- const std::vector<CallInfo> &CoverCalls,
- const SplitScore &ReferenceScore) {
+ const std::vector<CallInfo> &CoverCalls) {
// Populate BinaryBasicBlock::OutputAddressRange with estimated
// new start and end addresses after hot-warm splitting at SplitIndex.
size_t OldHotEnd;
@@ -533,47 +534,74 @@ private:
// increamented in place.
computeJumpScore(BlockOrder, SplitIndex, Score);
- // There is no need to compute CoverCallScore if we have already found
- // another split index with a bigger LocalScore and bigger HotSizeReduction.
- if (Score.LocalScore <= ReferenceScore.LocalScore &&
- Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
- return Score;
-
// Compute CoverCallScore and store in Score in place.
computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
return Score;
}
+ /// Find the most likely successor of a basic block when it has one or two
+ /// successors. Return nullptr otherwise.
+ const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
+ if (BB->succ_size() == 1)
+ return BB->getSuccessor();
+ if (BB->succ_size() == 2) {
+ uint64_t TakenCount = BB->getTakenBranchInfo().Count;
+ assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+ uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
+ assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+ if (TakenCount > NonTakenCount)
+ return BB->getConditionalSuccessor(true);
+ else if (TakenCount < NonTakenCount)
+ return BB->getConditionalSuccessor(false);
+ }
+ return nullptr;
+ }
+
/// Find the best index for splitting. The returned value is the index of the
/// last hot basic block. Hence, "no splitting" is equivalent to returning the
/// value which is one less than the size of the function.
size_t findSplitIndex(const BinaryFunction &BF,
const BasicBlockOrder &BlockOrder) {
+ assert(BlockOrder.size() > 2);
// Find all function calls that can be shortened if we move blocks of the
// current function to warm/cold
const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
- // Try all possible split indices (blocks with Index <= SplitIndex are in
- // hot) and find the one maximizing the splitting score.
+ // Find the existing hot-cold splitting index.
+ size_t HotColdIndex = 0;
+ while (HotColdIndex + 1 < BlockOrder.size()) {
+ if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
+ break;
+ HotColdIndex++;
+ }
+ assert(HotColdIndex + 1 == BlockOrder.size() ||
+ (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
+ BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
+ FragmentNum::cold()));
+
+ // Try all possible split indices up to HotColdIndex (blocks that have
+ // Index <= SplitIndex are in hot) and find the one maximizing the
+ // splitting score.
SplitScore BestScore;
- double BestScoreSum = -1.0;
- SplitScore ReferenceScore;
- for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+ for (size_t Index = 0; Index <= HotColdIndex; Index++) {
const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
- // No need to keep cold blocks in the hot section.
- if (LastHotBB->getFragmentNum() == FragmentNum::cold())
- break;
+ assert(LastHotBB->getFragmentNum() != FragmentNum::cold());
+
+ // Do not break a fall-through jump to the most likely successor.
+ if (Index + 1 < BlockOrder.size() &&
+ BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
+ continue;
+
const SplitScore Score =
- computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
- double ScoreSum = Score.LocalScore + Score.CoverCallScore;
- if (ScoreSum > BestScoreSum) {
- BestScoreSum = ScoreSum;
+ computeSplitScore(BF, BlockOrder, Index, CoverCalls);
+ if (Score.sum() > BestScore.sum())
BestScore = Score;
- }
- if (Score.LocalScore > ReferenceScore.LocalScore)
- ReferenceScore = Score;
}
+ // If we don't find a good splitting point, fall back to the original one.
+ if (BestScore.SplitIndex == size_t(-1))
+ return HotColdIndex;
+
return BestScore.SplitIndex;
}
};
diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s
index 5b4f92832624..5701d9e6dfd6 100644
--- a/bolt/test/X86/cdsplit-call-scale.s
+++ b/bolt/test/X86/cdsplit-call-scale.s
@@ -2,8 +2,9 @@
# When -call-scale=0.0, the tested function is split 2 ways.
# When -call-scale=1.0, the tested function is split 3 ways with 5 blocks
# in warm because of the increased benefit of shortening the call edges.
-# When -call-scale=1000.0, the tested function is 3-way splitted with 7 blocks
-# in warm because of the strong benefit of shortening the call edges.
+# When -call-scale=1000.0, the tested function is still split 3 ways with
+# 5 blocks in warm because cdsplit does not allow hot-warm splitting to break
+# a fall-through branch from a basic block to its most likely successor.
# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
@@ -39,12 +40,10 @@
# MEDINCENTIVE: {{^\.Ltmp5}}
# HIGHINCENTIVE: Binary Function "chain" after split-functions
-# HIGHINCENTIVE: {{^\.LBB00}}
+# HIGHINCENTIVE: {{^\.Ltmp1}}
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
# HIGHINCENTIVE: {{^\.LFT1}}
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
-# HIGHINCENTIVE: {{^\.LFT0}}
-# HIGHINCENTIVE: {{^\.Ltmp1}}
# HIGHINCENTIVE: {{^\.Ltmp0}}
# HIGHINCENTIVE: {{^\.Ltmp2}}
# HIGHINCENTIVE: {{^\.Ltmp3}}
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 7a910037368c..435cb1e3fbcf 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -83,6 +83,7 @@
#include "UnhandledSelfAssignmentCheck.h"
#include "UniquePtrArrayMismatchCheck.h"
#include "UnsafeFunctionsCheck.h"
+#include "UnusedLocalNonTrivialVariableCheck.h"
#include "UnusedRaiiCheck.h"
#include "UnusedReturnValueCheck.h"
#include "UseAfterMoveCheck.h"
@@ -235,6 +236,8 @@ public:
"bugprone-unique-ptr-array-mismatch");
CheckFactories.registerCheck<UnsafeFunctionsCheck>(
"bugprone-unsafe-functions");
+ CheckFactories.registerCheck<UnusedLocalNonTrivialVariableCheck>(
+ "bugprone-unused-local-non-trivial-variable");
CheckFactories.registerCheck<UnusedRaiiCheck>("bugprone-unused-raii");
CheckFactories.registerCheck<UnusedReturnValueCheck>(
"bugprone-unused-return-value");
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index d443fd8d1452..70e7fbc7ec0c 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -79,6 +79,7 @@ add_clang_library(clangTidyBugproneModule
UnhandledSelfAssignmentCheck.cpp
UniquePtrArrayMismatchCheck.cpp
UnsafeFunctionsCheck.cpp
+ UnusedLocalNonTrivialVariableCheck.cpp
UnusedRaiiCheck.cpp
UnusedReturnValueCheck.cpp
UseAfterMoveCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
new file mode 100644
index 000000000000..1b763d291082
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
@@ -0,0 +1,91 @@
+//===--- UnusedLocalNonTrivialVariableCheck.cpp - clang-tidy --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "UnusedLocalNonTrivialVariableCheck.h"
+#include "../utils/Matchers.h"
+#include "../utils/OptionsUtils.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTTypeTraits.h"
+#include "clang/AST/Type.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/ASTMatchers/ASTMatchersMacros.h"
+
+using namespace clang::ast_matchers;
+using namespace clang::tidy::matchers;
+
+namespace clang::tidy::bugprone {
+
+namespace {
+static constexpr StringRef DefaultIncludeTypeRegex =
+ "::std::.*mutex;::std::future;::std::basic_string;::std::basic_regex;"
+ "::std::basic_istringstream;::std::basic_stringstream;::std::bitset;"
+ "::std::filesystem::path";
+
+AST_MATCHER(VarDecl, isLocalVarDecl) { return Node.isLocalVarDecl(); }
+AST_MATCHER(VarDecl, isReferenced) { return Node.isReferenced(); }
+AST_MATCHER(Type, isReferenceType) { return Node.isReferenceType(); }
+AST_MATCHER(QualType, isTrivial) {
+ return Node.isTrivialType(Finder->getASTContext()) ||
+ Node.isTriviallyCopyableType(Finder->getASTContext());
+}
+} // namespace
+
+UnusedLocalNonTrivialVariableCheck::UnusedLocalNonTrivialVariableCheck(
+ StringRef Name, ClangTidyContext *Context)
+ : ClangTidyCheck(Name, Context),
+ IncludeTypes(utils::options::parseStringList(
+ Options.get("IncludeTypes", DefaultIncludeTypeRegex))),
+ ExcludeTypes(
+ utils::options::parseStringList(Options.get("ExcludeTypes", ""))) {}
+
+void UnusedLocalNonTrivialVariableCheck::storeOptions(
+ ClangTidyOptions::OptionMap &Opts) {
+ Options.store(Opts, "IncludeTypes",
+ utils::options::serializeStringList(IncludeTypes));
+ Options.store(Opts, "ExcludeTypes",
+ utils::options::serializeStringList(ExcludeTypes));
+}
+
+void UnusedLocalNonTrivialVariableCheck::registerMatchers(MatchFinder *Finder) {
+ if (IncludeTypes.empty())
+ return;
+
+ Finder->addMatcher(
+ varDecl(isLocalVarDecl(), unless(isReferenced()),
+ unless(isExceptionVariable()), hasLocalStorage(), isDefinition(),
+ unless(hasType(isReferenceType())), unless(hasType(isTrivial())),
+ hasType(hasUnqualifiedDesugaredType(
+ anyOf(recordType(hasDeclaration(namedDecl(
+ matchesAnyListedName(IncludeTypes),
+ unless(matchesAnyListedName(ExcludeTypes))))),
+ templateSpecializationType(hasDeclaration(namedDecl(
+ matchesAnyListedName(IncludeTypes),
+ unless(matchesAnyListedName(ExcludeTypes)))))))))
+ .bind("var"),
+ this);
+}
+
+void UnusedLocalNonTrivialVariableCheck::check(
+ const MatchFinder::MatchResult &Result) {
+ const auto *MatchedDecl = Result.Nodes.getNodeAs<VarDecl>("var");
+ diag(MatchedDecl->getLocation(), "unused local variable %0 of type %1")
+ << MatchedDecl << MatchedDecl->getType();
+}
+
+bool UnusedLocalNonTrivialVariableCheck::isLanguageVersionSupported(
+ const LangOptions &LangOpts) const {
+ return LangOpts.CPlusPlus;
+}
+
+std::optional<TraversalKind>
+UnusedLocalNonTrivialVariableCheck::getCheckTraversalKind() const {
+ return TK_IgnoreUnlessSpelledInSource;
+}
+
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h
new file mode 100644
index 000000000000..e79b803a2158
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h
@@ -0,0 +1,44 @@
+//===--- UnusedLocalNonTrivialVariableCheck.h - clang-tidy ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNUSEDLOCALNONTRIVIALVARIABLECHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNUSEDLOCALNONTRIVIALVARIABLECHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::bugprone {
+
+/// Warns when a local non-trivial variable is unused within a function. By
+/// default, several standard library types such as std::.*mutex and
+/// std::future are checked; see the 'IncludeTypes' option for the full list.
+///
+/// The check supports these options:
+/// - 'IncludeTypes': a semicolon-separated list of regular expressions
+/// matching types of variables that must be used.
+/// - 'ExcludeTypes': a semicolon-separated list of regular expressions
+/// matching types that are excluded from the
+/// 'IncludeTypes' matches.
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.html
+class UnusedLocalNonTrivialVariableCheck : public ClangTidyCheck {
+public:
+ UnusedLocalNonTrivialVariableCheck(StringRef Name, ClangTidyContext *Context);
+ void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+ void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+ void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override;
+ std::optional<TraversalKind> getCheckTraversalKind() const override;
+
+private:
+ const std::vector<StringRef> IncludeTypes;
+ const std::vector<StringRef> ExcludeTypes;
+};
+
+} // namespace clang::tidy::bugprone
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNUSEDLOCALNONTRIVIALVARIABLECHECK_H
diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp
index e6293ed48bfd..bb05f206c717 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp
@@ -11,26 +11,39 @@
#include "clang/Lex/Lexer.h"
using namespace clang::ast_matchers;
+namespace {
+
+AST_MATCHER(clang::LinkageSpecDecl, isExternCLinkage) {
+ return Node.getLanguage() == clang::LinkageSpecLanguageIDs::C;
+}
+} // namespace
namespace clang::tidy::modernize {
+static constexpr llvm::StringLiteral ExternCDeclName = "extern-c-decl";
static constexpr llvm::StringLiteral ParentDeclName = "parent-decl";
static constexpr llvm::StringLiteral TagDeclName = "tag-decl";
static constexpr llvm::StringLiteral TypedefName = "typedef";
UseUsingCheck::UseUsingCheck(StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),
- IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {}
+ IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)),
+ IgnoreExternC(Options.get("IgnoreExternC", false)) {}
void UseUsingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
Options.store(Opts, "IgnoreMacros", IgnoreMacros);
+ Options.store(Opts, "IgnoreExternC", IgnoreExternC);
}
void UseUsingCheck::registerMatchers(MatchFinder *Finder) {
- Finder->addMatcher(typedefDecl(unless(isInstantiated()),
- hasParent(decl().bind(ParentDeclName)))
- .bind(TypedefName),
- this);
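+  // Bind the enclosing extern "C" block, if any, so that check() can skip
+  // the typedef when the IgnoreExternC option is enabled.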
+ Finder->addMatcher(
+ typedefDecl(
+ unless(isInstantiated()),
+ optionally(hasAncestor(
+ linkageSpecDecl(isExternCLinkage()).bind(ExternCDeclName))),
+ hasParent(decl().bind(ParentDeclName)))
+ .bind(TypedefName),
+ this);
// This matcher is used to find tag declarations in source code within
// typedefs. They appear in the AST just *prior* to the typedefs.
@@ -70,6 +83,11 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) {
if (MatchedDecl->getLocation().isInvalid())
return;
+ const auto *ExternCDecl =
+ Result.Nodes.getNodeAs<LinkageSpecDecl>(ExternCDeclName);
+ if (ExternCDecl && IgnoreExternC)
+ return;
+
SourceLocation StartLoc = MatchedDecl->getBeginLoc();
if (StartLoc.isMacroID() && IgnoreMacros)
@@ -122,7 +140,8 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) {
Type = FirstTypedefName + Type.substr(FirstTypedefType.size() + 1);
}
if (!ReplaceRange.getEnd().isMacroID()) {
- const SourceLocation::IntTy Offset = MatchedDecl->getFunctionType() ? 0 : Name.size();
+ const SourceLocation::IntTy Offset =
+ MatchedDecl->getFunctionType() ? 0 : Name.size();
LastReplacementEnd = ReplaceRange.getEnd().getLocWithOffset(Offset);
}
diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h
index 5c741a92d013..7054778d84a0 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h
@@ -20,6 +20,7 @@ namespace clang::tidy::modernize {
class UseUsingCheck : public ClangTidyCheck {
const bool IgnoreMacros;
+ const bool IgnoreExternC;
SourceLocation LastReplacementEnd;
llvm::DenseMap<const Decl *, SourceRange> LastTagDeclRanges;
diff --git a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp
index 3c8751dbdd73..ad900fcec2de 100644
--- a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp
@@ -97,7 +97,7 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) {
if (!AlgDecl)
return;
- if (Unordered && AlgDecl->getName().find("bound") != llvm::StringRef::npos)
+ if (Unordered && AlgDecl->getName().contains("bound"))
return;
const auto *AlgParam = Result.Nodes.getNodeAs<Expr>("AlgParam");
diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
index 03dcfa5f8110..e6f44dd51b45 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
@@ -9,6 +9,7 @@
#include "IdentifierNamingCheck.h"
#include "../GlobList.h"
+#include "../utils/ASTUtils.h"
#include "clang/AST/CXXInheritance.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/Preprocessor.h"
@@ -286,7 +287,9 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions(
HPTOpt.value_or(IdentifierNamingCheck::HPT_Off));
}
bool IgnoreMainLike = Options.get("IgnoreMainLikeFunctions", false);
- return {std::move(Styles), std::move(HNOption), IgnoreMainLike};
+ bool CheckAnonFieldInParent = Options.get("CheckAnonFieldInParent", false);
+ return {std::move(Styles), std::move(HNOption), IgnoreMainLike,
+ CheckAnonFieldInParent};
}
std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName(
@@ -859,6 +862,8 @@ void IdentifierNamingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
Options.store(Opts, "IgnoreFailedSplit", IgnoreFailedSplit);
Options.store(Opts, "IgnoreMainLikeFunctions",
MainFileStyle->isIgnoringMainLikeFunction());
+ Options.store(Opts, "CheckAnonFieldInParent",
+ MainFileStyle->isCheckingAnonFieldInParentScope());
}
bool IdentifierNamingCheck::matchesStyle(
@@ -1111,7 +1116,7 @@ std::string IdentifierNamingCheck::fixupWithStyle(
StyleKind IdentifierNamingCheck::findStyleKind(
const NamedDecl *D,
ArrayRef<std::optional<IdentifierNamingCheck::NamingStyle>> NamingStyles,
- bool IgnoreMainLikeFunctions) const {
+ bool IgnoreMainLikeFunctions, bool CheckAnonFieldInParentScope) const {
assert(D && D->getIdentifier() && !D->getName().empty() && !D->isImplicit() &&
"Decl must be an explicit identifier with a name.");
@@ -1185,29 +1190,14 @@ StyleKind IdentifierNamingCheck::findStyleKind(
}
if (const auto *Decl = dyn_cast<FieldDecl>(D)) {
- QualType Type = Decl->getType();
-
- if (!Type.isNull() && Type.isConstQualified()) {
- if (NamingStyles[SK_ConstantMember])
- return SK_ConstantMember;
-
- if (NamingStyles[SK_Constant])
- return SK_Constant;
+ if (CheckAnonFieldInParentScope) {
+ const RecordDecl *Record = Decl->getParent();
+ if (Record->isAnonymousStructOrUnion()) {
+ return findStyleKindForAnonField(Decl, NamingStyles);
+ }
}
- if (Decl->getAccess() == AS_private && NamingStyles[SK_PrivateMember])
- return SK_PrivateMember;
-
- if (Decl->getAccess() == AS_protected && NamingStyles[SK_ProtectedMember])
- return SK_ProtectedMember;
-
- if (Decl->getAccess() == AS_public && NamingStyles[SK_PublicMember])
- return SK_PublicMember;
-
- if (NamingStyles[SK_Member])
- return SK_Member;
-
- return SK_Invalid;
+ return findStyleKindForField(Decl, Decl->getType(), NamingStyles);
}
if (const auto *Decl = dyn_cast<ParmVarDecl>(D)) {
@@ -1244,66 +1234,7 @@ StyleKind IdentifierNamingCheck::findStyleKind(
}
if (const auto *Decl = dyn_cast<VarDecl>(D)) {
- QualType Type = Decl->getType();
-
- if (Decl->isConstexpr() && NamingStyles[SK_ConstexprVariable])
- return SK_ConstexprVariable;
-
- if (!Type.isNull() && Type.isConstQualified()) {
- if (Decl->isStaticDataMember() && NamingStyles[SK_ClassConstant])
- return SK_ClassConstant;
-
- if (Decl->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
- NamingStyles[SK_GlobalConstantPointer])
- return SK_GlobalConstantPointer;
-
- if (Decl->isFileVarDecl() && NamingStyles[SK_GlobalConstant])
- return SK_GlobalConstant;
-
- if (Decl->isStaticLocal() && NamingStyles[SK_StaticConstant])
- return SK_StaticConstant;
-
- if (Decl->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
- NamingStyles[SK_LocalConstantPointer])
- return SK_LocalConstantPointer;
-
- if (Decl->isLocalVarDecl() && NamingStyles[SK_LocalConstant])
- return SK_LocalConstant;
-
- if (Decl->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalConstant])
- return SK_LocalConstant;
-
- if (NamingStyles[SK_Constant])
- return SK_Constant;
- }
-
- if (Decl->isStaticDataMember() && NamingStyles[SK_ClassMember])
- return SK_ClassMember;
-
- if (Decl->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
- NamingStyles[SK_GlobalPointer])
- return SK_GlobalPointer;
-
- if (Decl->isFileVarDecl() && NamingStyles[SK_GlobalVariable])
- return SK_GlobalVariable;
-
- if (Decl->isStaticLocal() && NamingStyles[SK_StaticVariable])
- return SK_StaticVariable;
-
- if (Decl->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
- NamingStyles[SK_LocalPointer])
- return SK_LocalPointer;
-
- if (Decl->isLocalVarDecl() && NamingStyles[SK_LocalVariable])
- return SK_LocalVariable;
-
- if (Decl->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalVariable])
- return SK_LocalVariable;
-
- if (NamingStyles[SK_Variable])
- return SK_Variable;
-
- return SK_Invalid;
+ return findStyleKindForVar(Decl, Decl->getType(), NamingStyles);
}
if (const auto *Decl = dyn_cast<CXXMethodDecl>(D)) {
@@ -1442,12 +1373,13 @@ IdentifierNamingCheck::getDeclFailureInfo(const NamedDecl *Decl,
if (!FileStyle.isActive())
return std::nullopt;
- return getFailureInfo(HungarianNotation.getDeclTypeName(Decl),
- Decl->getName(), Decl, Loc, FileStyle.getStyles(),
- FileStyle.getHNOption(),
- findStyleKind(Decl, FileStyle.getStyles(),
- FileStyle.isIgnoringMainLikeFunction()),
- SM, IgnoreFailedSplit);
+ return getFailureInfo(
+ HungarianNotation.getDeclTypeName(Decl), Decl->getName(), Decl, Loc,
+ FileStyle.getStyles(), FileStyle.getHNOption(),
+ findStyleKind(Decl, FileStyle.getStyles(),
+ FileStyle.isIgnoringMainLikeFunction(),
+ FileStyle.isCheckingAnonFieldInParentScope()),
+ SM, IgnoreFailedSplit);
}
std::optional<RenamerClangTidyCheck::FailureInfo>
@@ -1496,5 +1428,114 @@ IdentifierNamingCheck::getStyleForFile(StringRef FileName) const {
return It.first->getValue();
}
+StyleKind IdentifierNamingCheck::findStyleKindForAnonField(
+ const FieldDecl *AnonField,
+ ArrayRef<std::optional<NamingStyle>> NamingStyles) const {
+ const IndirectFieldDecl *IFD =
+ utils::findOutermostIndirectFieldDeclForField(AnonField);
+ assert(IFD && "Found an anonymous record field without an IndirectFieldDecl");
+
+ QualType Type = AnonField->getType();
+
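+  // The outermost indirect field is either injected into an enclosing record
+  // (treat it like a regular field of that record) or, for anonymous unions
+  // at namespace or function scope, modeled as a variable.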
+ if (const auto *F = dyn_cast<FieldDecl>(IFD->chain().front())) {
+ return findStyleKindForField(F, Type, NamingStyles);
+ }
+
+ if (const auto *V = IFD->getVarDecl()) {
+ return findStyleKindForVar(V, Type, NamingStyles);
+ }
+
+ return SK_Invalid;
+}
+
+StyleKind IdentifierNamingCheck::findStyleKindForField(
+ const FieldDecl *Field, QualType Type,
+ ArrayRef<std::optional<NamingStyle>> NamingStyles) const {
+ if (!Type.isNull() && Type.isConstQualified()) {
+ if (NamingStyles[SK_ConstantMember])
+ return SK_ConstantMember;
+
+ if (NamingStyles[SK_Constant])
+ return SK_Constant;
+ }
+
+ if (Field->getAccess() == AS_private && NamingStyles[SK_PrivateMember])
+ return SK_PrivateMember;
+
+ if (Field->getAccess() == AS_protected && NamingStyles[SK_ProtectedMember])
+ return SK_ProtectedMember;
+
+ if (Field->getAccess() == AS_public && NamingStyles[SK_PublicMember])
+ return SK_PublicMember;
+
+ if (NamingStyles[SK_Member])
+ return SK_Member;
+
+ return SK_Invalid;
+}
+
+StyleKind IdentifierNamingCheck::findStyleKindForVar(
+ const VarDecl *Var, QualType Type,
+ ArrayRef<std::optional<NamingStyle>> NamingStyles) const {
+ if (Var->isConstexpr() && NamingStyles[SK_ConstexprVariable])
+ return SK_ConstexprVariable;
+
+ if (!Type.isNull() && Type.isConstQualified()) {
+ if (Var->isStaticDataMember() && NamingStyles[SK_ClassConstant])
+ return SK_ClassConstant;
+
+ if (Var->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
+ NamingStyles[SK_GlobalConstantPointer])
+ return SK_GlobalConstantPointer;
+
+ if (Var->isFileVarDecl() && NamingStyles[SK_GlobalConstant])
+ return SK_GlobalConstant;
+
+ if (Var->isStaticLocal() && NamingStyles[SK_StaticConstant])
+ return SK_StaticConstant;
+
+ if (Var->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
+ NamingStyles[SK_LocalConstantPointer])
+ return SK_LocalConstantPointer;
+
+ if (Var->isLocalVarDecl() && NamingStyles[SK_LocalConstant])
+ return SK_LocalConstant;
+
+ if (Var->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalConstant])
+ return SK_LocalConstant;
+
+ if (NamingStyles[SK_Constant])
+ return SK_Constant;
+ }
+
+ if (Var->isStaticDataMember() && NamingStyles[SK_ClassMember])
+ return SK_ClassMember;
+
+ if (Var->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
+ NamingStyles[SK_GlobalPointer])
+ return SK_GlobalPointer;
+
+ if (Var->isFileVarDecl() && NamingStyles[SK_GlobalVariable])
+ return SK_GlobalVariable;
+
+ if (Var->isStaticLocal() && NamingStyles[SK_StaticVariable])
+ return SK_StaticVariable;
+
+ if (Var->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() &&
+ NamingStyles[SK_LocalPointer])
+ return SK_LocalPointer;
+
+ if (Var->isLocalVarDecl() && NamingStyles[SK_LocalVariable])
+ return SK_LocalVariable;
+
+ if (Var->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalVariable])
+ return SK_LocalVariable;
+
+ if (NamingStyles[SK_Variable])
+ return SK_Variable;
+
+ return SK_Invalid;
+}
+
} // namespace readability
} // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
index 14626981cc42..27c8e4bc768c 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
@@ -127,9 +127,11 @@ public:
struct FileStyle {
FileStyle() : IsActive(false), IgnoreMainLikeFunctions(false) {}
FileStyle(SmallVectorImpl<std::optional<NamingStyle>> &&Styles,
- HungarianNotationOption HNOption, bool IgnoreMainLike)
+ HungarianNotationOption HNOption, bool IgnoreMainLike,
+ bool CheckAnonFieldInParent)
: Styles(std::move(Styles)), HNOption(std::move(HNOption)),
- IsActive(true), IgnoreMainLikeFunctions(IgnoreMainLike) {}
+ IsActive(true), IgnoreMainLikeFunctions(IgnoreMainLike),
+ CheckAnonFieldInParentScope(CheckAnonFieldInParent) {}
ArrayRef<std::optional<NamingStyle>> getStyles() const {
assert(IsActive);
@@ -144,11 +146,16 @@ public:
bool isActive() const { return IsActive; }
bool isIgnoringMainLikeFunction() const { return IgnoreMainLikeFunctions; }
+ bool isCheckingAnonFieldInParentScope() const {
+ return CheckAnonFieldInParentScope;
+ }
+
private:
SmallVector<std::optional<NamingStyle>, 0> Styles;
HungarianNotationOption HNOption;
bool IsActive;
bool IgnoreMainLikeFunctions;
+ bool CheckAnonFieldInParentScope;
};
IdentifierNamingCheck::FileStyle
@@ -175,7 +182,7 @@ public:
StyleKind findStyleKind(
const NamedDecl *D,
ArrayRef<std::optional<IdentifierNamingCheck::NamingStyle>> NamingStyles,
- bool IgnoreMainLikeFunctions) const;
+ bool IgnoreMainLikeFunctions, bool CheckAnonFieldInParentScope) const;
std::optional<RenamerClangTidyCheck::FailureInfo> getFailureInfo(
StringRef Type, StringRef Name, const NamedDecl *ND,
@@ -199,6 +206,19 @@ private:
const FileStyle &getStyleForFile(StringRef FileName) const;
+ /// Find the style kind of a field in an anonymous record.
+ StyleKind findStyleKindForAnonField(
+ const FieldDecl *AnonField,
+ ArrayRef<std::optional<NamingStyle>> NamingStyles) const;
+
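+  /// Find the style kind of a record field based on its access and type.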
+ StyleKind findStyleKindForField(
+ const FieldDecl *Field, QualType Type,
+ ArrayRef<std::optional<NamingStyle>> NamingStyles) const;
+
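+  /// Find the style kind of a variable based on its storage and type.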
+ StyleKind
+ findStyleKindForVar(const VarDecl *Var, QualType Type,
+ ArrayRef<std::optional<NamingStyle>> NamingStyles) const;
+
/// Stores the style options as a vector, indexed by the specified \ref
/// StyleKind, for a given directory.
mutable llvm::StringMap<FileStyle> NamingStylesCache;
diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp
index 2c011f5c0e69..e32f79589a05 100644
--- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "MisleadingIndentationCheck.h"
+#include "../utils/LexerUtils.h"
#include "clang/AST/ASTContext.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
@@ -51,8 +52,20 @@ void MisleadingIndentationCheck::danglingElseCheck(const SourceManager &SM,
diag(ElseLoc, "different indentation for 'if' and corresponding 'else'");
}
-void MisleadingIndentationCheck::missingBracesCheck(const SourceManager &SM,
- const CompoundStmt *CStmt) {
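+// Returns true if no token precedes NextLoc on its line, i.e. the previous
+// token (even a macro name that expands to nothing) starts on an earlier
+// line.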
+static bool isAtStartOfLineIncludingEmptyMacro(SourceLocation NextLoc,
+ const SourceManager &SM,
+ const LangOptions &LangOpts) {
+ const SourceLocation BeforeLoc =
+ utils::lexer::getPreviousTokenAndStart(NextLoc, SM, LangOpts).second;
+ if (BeforeLoc.isInvalid())
+ return false;
+ return SM.getExpansionLineNumber(BeforeLoc) !=
+ SM.getExpansionLineNumber(NextLoc);
+}
+
+void MisleadingIndentationCheck::missingBracesCheck(
+ const SourceManager &SM, const CompoundStmt *CStmt,
+ const LangOptions &LangOpts) {
const static StringRef StmtNames[] = {"if", "for", "while"};
for (unsigned int I = 0; I < CStmt->size() - 1; I++) {
const Stmt *CurrentStmt = CStmt->body_begin()[I];
@@ -92,6 +105,8 @@ void MisleadingIndentationCheck::missingBracesCheck(const SourceManager &SM,
if (NextLoc.isInvalid() || NextLoc.isMacroID())
continue;
+ if (!isAtStartOfLineIncludingEmptyMacro(NextLoc, SM, LangOpts))
+ continue;
if (SM.getExpansionColumnNumber(InnerLoc) ==
SM.getExpansionColumnNumber(NextLoc)) {
@@ -117,7 +132,8 @@ void MisleadingIndentationCheck::check(const MatchFinder::MatchResult &Result) {
danglingElseCheck(*Result.SourceManager, Result.Context, If);
if (const auto *CStmt = Result.Nodes.getNodeAs<CompoundStmt>("compound"))
- missingBracesCheck(*Result.SourceManager, CStmt);
+ missingBracesCheck(*Result.SourceManager, CStmt,
+ Result.Context->getLangOpts());
}
} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h
index c336abbc7c4a..9c92fc1e18b6 100644
--- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h
@@ -32,7 +32,8 @@ public:
private:
void danglingElseCheck(const SourceManager &SM, ASTContext *Context,
const IfStmt *If);
- void missingBracesCheck(const SourceManager &SM, const CompoundStmt *CStmt);
+ void missingBracesCheck(const SourceManager &SM, const CompoundStmt *CStmt,
+ const LangOptions &LangOpts);
};
} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp
index d274abcbfabe..7d4698d27ed1 100644
--- a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp
@@ -16,7 +16,8 @@ using namespace clang::ast_matchers;
namespace clang::tidy::readability {
static const char KDefaultTypes[] =
- "::std::basic_string;::std::basic_string_view;::std::vector;::std::array";
+ "::std::basic_string;::std::basic_string_view;::std::vector;::std::array;::"
+ "std::span";
SimplifySubscriptExprCheck::SimplifySubscriptExprCheck(
StringRef Name, ClangTidyContext *Context)
diff --git a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py
index 8817e2914f6e..d96b3450fdbe 100755
--- a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py
+++ b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py
@@ -173,6 +173,12 @@ def main():
help="checks filter, when not specified, use clang-tidy " "default",
default="",
)
+ parser.add_argument(
+ "-config-file",
+ dest="config_file",
+ help="Specify the path of .clang-tidy or custom config file",
+ default="",
+ )
parser.add_argument("-use-color", action="store_true", help="Use colors in output")
parser.add_argument(
"-path", dest="build_path", help="Path used to read a compile command database."
@@ -313,6 +319,8 @@ def main():
common_clang_tidy_args.append("-fix")
if args.checks != "":
common_clang_tidy_args.append("-checks=" + args.checks)
+ if args.config_file != "":
+ common_clang_tidy_args.append("-config-file=" + args.config_file)
if args.quiet:
common_clang_tidy_args.append("-quiet")
if args.build_path is not None:
diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
index 64333f2c1874..fd5dadc9b01d 100644
--- a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp
@@ -113,4 +113,28 @@ bool areStatementsIdentical(const Stmt *FirstStmt, const Stmt *SecondStmt,
return DataFirst == DataSecond;
}
+const IndirectFieldDecl *
+findOutermostIndirectFieldDeclForField(const FieldDecl *FD) {
+ const RecordDecl *Record = FD->getParent();
+ assert(Record->isAnonymousStructOrUnion() &&
+ "FD must be a field in an anonymous record");
+
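+  // Walk out of nested anonymous records to the first enclosing context that
+  // can own the IndirectFieldDecl.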
+ const DeclContext *Context = Record;
+ while (isa<RecordDecl>(Context) &&
+ cast<RecordDecl>(Context)->isAnonymousStructOrUnion()) {
+ Context = Context->getParent();
+ }
+
+ // Search for the target IndirectFieldDecl within the located context.
+ for (const auto *D : Context->decls()) {
+ const auto *IFD = dyn_cast<IndirectFieldDecl>(D);
+ if (!IFD)
+ continue;
+ if (IFD->getAnonField() == FD)
+ return IFD;
+ }
+
+ return nullptr;
+}
+
} // namespace clang::tidy::utils
diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.h b/clang-tools-extra/clang-tidy/utils/ASTUtils.h
index 1bba5daf2fc7..6c3e54facd02 100644
--- a/clang-tools-extra/clang-tidy/utils/ASTUtils.h
+++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.h
@@ -40,6 +40,11 @@ bool rangeCanBeFixed(SourceRange Range, const SourceManager *SM);
bool areStatementsIdentical(const Stmt *FirstStmt, const Stmt *SecondStmt,
const ASTContext &Context, bool Canonical = false);
+// Given a field of an anonymous record, find its corresponding
+// IndirectFieldDecl in the outermost possible scope.
+const IndirectFieldDecl *
+findOutermostIndirectFieldDeclForField(const FieldDecl *FD);
+
} // namespace clang::tidy::utils
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H
diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp
index 6fbb310b660a..5722ca8f66eb 100644
--- a/clang-tools-extra/clangd/InlayHints.cpp
+++ b/clang-tools-extra/clangd/InlayHints.cpp
@@ -286,7 +286,7 @@ std::string summarizeExpr(const Expr *E) {
// Step through implicit nodes that clang doesn't classify as such.
std::string VisitCXXMemberCallExpr(const CXXMemberCallExpr *E) {
// Call to operator bool() inside if (X): dispatch to X.
- if (E->getNumArgs() == 0 &&
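+    // getMethodDecl() may be null, e.g. for a call through a pointer to
+    // member function, so guard against dereferencing it.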
+ if (E->getNumArgs() == 0 && E->getMethodDecl() &&
E->getMethodDecl()->getDeclName().getNameKind() ==
DeclarationName::CXXConversionFunctionName &&
E->getSourceRange() ==
diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp
index 0ca95b5fed5d..0fff0dfca6c9 100644
--- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp
+++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp
@@ -2205,6 +2205,19 @@ TEST(BlockEndHints, Macro) {
ExpectedHint{" // struct S1", "S1"});
}
+TEST(BlockEndHints, PointerToMemberFunction) {
+ // Do not crash trying to summarize `a->*p`.
+ assertBlockEndHints(R"cpp(
+ class A {};
+ using Predicate = bool(A::*)();
+ void foo(A* a, Predicate p) {
+ if ((a->*p)()) {
+ $ptrmem[[}]]
+ } // suppress
+ )cpp",
+ ExpectedHint{" // if", "ptrmem"});
+}
+
// FIXME: Low-hanging fruit where we could omit a type hint:
// - auto x = TypeName(...);
// - auto x = (TypeName) (...);
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 6d91748e4cef..571808a51596 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -119,15 +119,22 @@ Improvements to clang-tidy
- Improved `--dump-config` to print check options in alphabetical order.
-- Improved :program:`clang-tidy-diff.py` script. It now returns exit code `1`
- if any :program:`clang-tidy` subprocess exits with a non-zero code or if
- exporting fixes fails. It now accepts a directory as a value for
- `-export-fixes` to export individual yaml files for each compilation unit.
+- Improved :program:`clang-tidy-diff.py` script.
+ * Return exit code `1` if any :program:`clang-tidy` subprocess exits with
+ a non-zero code or if exporting fixes fails.
+
+ * Accept a directory as a value for `-export-fixes` to export individual
+ yaml files for each compilation unit.
+
+ * Introduce a `-config-file` option that forwards a configuration file to
+ :program:`clang-tidy`. Corresponds to the `--config-file` option in
+ :program:`clang-tidy`.
- Improved :program:`run-clang-tidy.py` script. It now accepts a directory
as a value for `-export-fixes` to export individual yaml files for each
compilation unit.
+
New checks
^^^^^^^^^^
@@ -168,6 +175,11 @@ New checks
extracted from an optional-like type and then used to create a new instance
of the same optional-like type.
+- New :doc:`bugprone-unused-local-non-trivial-variable
+ <clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check.
+
+ Warns when a local non-trivial variable is unused within a function.
+
- New :doc:`cppcoreguidelines-no-suspend-with-lock
<clang-tidy/checks/cppcoreguidelines/no-suspend-with-lock>` check.
@@ -396,7 +408,8 @@ Changes in existing checks
- Improved :doc:`modernize-use-using
<clang-tidy/checks/modernize/use-using>` check to fix function pointer and
- forward declared ``typedef`` correctly.
+ forward declared ``typedef`` correctly. Added option `IgnoreExternC` to
+ ignore ``typedef`` declarations in ``extern "C"`` scope.
- Improved :doc:`performance-faster-string-find
<clang-tidy/checks/performance/faster-string-find>` check to properly escape
@@ -441,7 +454,10 @@ Changes in existing checks
has been enhanced, particularly within complex types like function pointers
and cases where style checks were omitted when functions started with macros.
Added support for C++20 ``concept`` declarations. ``Camel_Snake_Case`` and
- ``camel_Snake_Case`` now detect more invalid identifier names.
+ ``camel_Snake_Case`` now detect more invalid identifier names. Fields in
+ anonymous records (i.e. anonymous structs and unions) can now be checked
+ with the naming rules associated with their enclosing scopes rather than
+ the naming rules of public struct/union members.
- Improved :doc:`readability-implicit-bool-conversion
<clang-tidy/checks/readability/implicit-bool-conversion>` check to take
@@ -449,10 +465,18 @@ Changes in existing checks
`AllowPointerConditions` options. It also now provides more consistent
suggestions when parentheses are added to the return value.
+- Improved :doc:`readability-misleading-indentation
+ <clang-tidy/checks/readability/misleading-indentation>` check to ignore
+ false-positives when a line starts with an empty macro.
+
- Improved :doc:`readability-non-const-parameter
<clang-tidy/checks/readability/non-const-parameter>` check to ignore
false-positives in initializer list of record.
+- Improved :doc:`readability-simplify-subscript-expr
+ <clang-tidy/checks/readability/simplify-subscript-expr>` check by extending
+ the default value of the `Types` option to include ``std::span``.
+
- Improved :doc:`readability-static-accessed-through-instance
<clang-tidy/checks/readability/static-accessed-through-instance>` check to
identify calls to static member functions with out-of-class inline definitions.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
new file mode 100644
index 000000000000..7531f19f3ebc
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
@@ -0,0 +1,55 @@
+.. title:: clang-tidy - bugprone-unused-local-non-trivial-variable
+
+bugprone-unused-local-non-trivial-variable
+==========================================
+
+Warns when a local non-trivial variable is unused within a function.
+The following kinds of variables are excluded from this check (see the
+sketch after the list):
+
+* trivial and trivially copyable
+* references and pointers
+* exception variables in catch clauses
+* static or thread local
+* structured bindings
+
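+A minimal sketch of what is and is not diagnosed, assuming the default
+`IncludeTypes` (the names in the snippet are illustrative only):
+
+.. code-block:: c++
+
+  #include <mutex>
+
+  void example(std::mutex &ExternalMutex) {
+    std::mutex LocalMutex;             // diagnosed: non-trivial and never used
+    std::mutex &Alias = ExternalMutex; // ignored: references are excluded
+    static std::mutex Persistent;      // ignored: static storage duration
+    int Counter = 0;                   // ignored: trivial type
+  }
+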
+This check can be configured to warn on all non-trivial variables by setting
+`IncludeTypes` to `.*`, and excluding specific types using `ExcludeTypes`.
+
+In the following example, `my_lock` would generate a warning that it is unused.
+
+.. code-block:: c++
+
+ std::mutex my_lock;
+ // my_lock local variable is never used
+
+In the next example, `future2` would generate a warning that it is unused.
+
+.. code-block:: c++
+
+ std::future<MyObject> future1;
+ std::future<MyObject> future2;
+ // ...
+ MyObject foo = future1.get();
+ // future2 is not used.
+
+Options
+-------
+
+.. option:: IncludeTypes
+
+ Semicolon-separated list of regular expressions matching types of variables
+ to check. By default the following types are checked:
+
+ * `::std::.*mutex`
+ * `::std::future`
+ * `::std::basic_string`
+ * `::std::basic_regex`
+ * `::std::basic_istringstream`
+ * `::std::basic_stringstream`
+ * `::std::bitset`
+ * `::std::filesystem::path`
+
+.. option:: ExcludeTypes
+
+ A semicolon-separated list of regular expressions matching types that are
+ excluded from the `IncludeTypes` matches. By default it is an empty list.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst
new file mode 100644
index 000000000000..99dd1adf38f6
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/optin.core.EnumCastOutOfRange.rst
@@ -0,0 +1,13 @@
+.. title:: clang-tidy - clang-analyzer-optin.core.EnumCastOutOfRange
+.. meta::
+ :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#optin-core-enumcastoutofrange
+
+clang-analyzer-optin.core.EnumCastOutOfRange
+============================================
+
+Check integer to enumeration casts for out of range values.
+
+The `clang-analyzer-optin.core.EnumCastOutOfRange` check is an alias; please see
+`Clang Static Analyzer Available Checkers
+<https://clang.llvm.org/docs/analyzer/checkers.html#optin-core-enumcastoutofrange>`_
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst
new file mode 100644
index 000000000000..8986fa0e684f
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/security.cert.env.InvalidPtr.rst
@@ -0,0 +1,13 @@
+.. title:: clang-tidy - clang-analyzer-security.cert.env.InvalidPtr
+.. meta::
+ :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr
+
+clang-analyzer-security.cert.env.InvalidPtr
+===========================================
+
+Finds usages of possibly invalidated pointers.
+
+The `clang-analyzer-security.cert.env.InvalidPtr` check is an alias; please see
+`Clang Static Analyzer Available Checkers
+<https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr>`_
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst
new file mode 100644
index 000000000000..67a2d05811dc
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.Errno.rst
@@ -0,0 +1,13 @@
+.. title:: clang-tidy - clang-analyzer-unix.Errno
+.. meta::
+ :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno
+
+clang-analyzer-unix.Errno
+=========================
+
+Check for improper use of 'errno'.
+
+The `clang-analyzer-unix.Errno` check is an alias; please see
+`Clang Static Analyzer Available Checkers
+<https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno>`_
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst
new file mode 100644
index 000000000000..17906732c86f
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/clang-analyzer/unix.StdCLibraryFunctions.rst
@@ -0,0 +1,14 @@
+.. title:: clang-tidy - clang-analyzer-unix.StdCLibraryFunctions
+.. meta::
+ :http-equiv=refresh: 5;URL=https://clang.llvm.org/docs/analyzer/checkers.html#unix-stdclibraryfunctions
+
+clang-analyzer-unix.StdCLibraryFunctions
+========================================
+
+Check for invalid arguments of C standard library functions, and apply relations
+between arguments and return value.
+
+The `clang-analyzer-unix.StdCLibraryFunctions` check is an alias; please see
+`Clang Static Analyzer Available Checkers
+<https://clang.llvm.org/docs/analyzer/checkers.html#unix-stdclibraryfunctions>`_
+for more information.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 31f0e090db1d..b36bf7d497b9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -149,6 +149,7 @@ Clang-Tidy Checks
:doc:`bugprone-unhandled-self-assignment <bugprone/unhandled-self-assignment>`,
:doc:`bugprone-unique-ptr-array-mismatch <bugprone/unique-ptr-array-mismatch>`, "Yes"
:doc:`bugprone-unsafe-functions <bugprone/unsafe-functions>`,
+ :doc:`bugprone-unused-local-non-trivial-variable <bugprone/unused-local-non-trivial-variable>`,
:doc:`bugprone-unused-raii <bugprone/unused-raii>`, "Yes"
:doc:`bugprone-unused-return-value <bugprone/unused-return-value>`,
:doc:`bugprone-use-after-move <bugprone/use-after-move>`,
@@ -439,6 +440,7 @@ Clang-Tidy Checks
:doc:`clang-analyzer-nullability.NullableDereferenced <clang-analyzer/nullability.NullableDereferenced>`, `Clang Static Analyzer nullability.NullableDereferenced <https://clang.llvm.org/docs/analyzer/checkers.html#nullability-nullabledereferenced>`_,
:doc:`clang-analyzer-nullability.NullablePassedToNonnull <clang-analyzer/nullability.NullablePassedToNonnull>`, `Clang Static Analyzer nullability.NullablePassedToNonnull <https://clang.llvm.org/docs/analyzer/checkers.html#nullability-nullablepassedtononnull>`_,
:doc:`clang-analyzer-nullability.NullableReturnedFromNonnull <clang-analyzer/nullability.NullableReturnedFromNonnull>`, `Clang Static Analyzer nullability.NullableReturnedFromNonnull <https://clang.llvm.org/docs/analyzer/checkers.html#nullability-nullablereturnedfromnonnull>`_,
+ :doc:`clang-analyzer-optin.core.EnumCastOutOfRange <clang-analyzer/optin.core.EnumCastOutOfRange>`, `Clang Static Analyzer optin.core.EnumCastOutOfRange <https://clang.llvm.org/docs/analyzer/checkers.html#optin-core-enumcastoutofrange>`_,
:doc:`clang-analyzer-optin.cplusplus.UninitializedObject <clang-analyzer/optin.cplusplus.UninitializedObject>`, `Clang Static Analyzer optin.cplusplus.UninitializedObject <https://clang.llvm.org/docs/analyzer/checkers.html#optin-cplusplus-uninitializedobject>`_,
:doc:`clang-analyzer-optin.cplusplus.VirtualCall <clang-analyzer/optin.cplusplus.VirtualCall>`, `Clang Static Analyzer optin.cplusplus.VirtualCall <https://clang.llvm.org/docs/analyzer/checkers.html#optin-cplusplus-virtualcall>`_,
:doc:`clang-analyzer-optin.mpi.MPI-Checker <clang-analyzer/optin.mpi.MPI-Checker>`, `Clang Static Analyzer optin.mpi.MPI-Checker <https://clang.llvm.org/docs/analyzer/checkers.html#optin-mpi-mpi-checker>`_,
@@ -478,6 +480,7 @@ Clang-Tidy Checks
:doc:`clang-analyzer-osx.coreFoundation.containers.OutOfBounds <clang-analyzer/osx.coreFoundation.containers.OutOfBounds>`, `Clang Static Analyzer osx.coreFoundation.containers.OutOfBounds <https://clang.llvm.org/docs/analyzer/checkers.html#osx-corefoundation-containers-outofbounds>`_,
:doc:`clang-analyzer-osx.coreFoundation.containers.PointerSizedValues <clang-analyzer/osx.coreFoundation.containers.PointerSizedValues>`, `Clang Static Analyzer osx.coreFoundation.containers.PointerSizedValues <https://clang.llvm.org/docs/analyzer/checkers.html#osx-corefoundation-containers-pointersizedvalues>`_,
:doc:`clang-analyzer-security.FloatLoopCounter <clang-analyzer/security.FloatLoopCounter>`, `Clang Static Analyzer security.FloatLoopCounter <https://clang.llvm.org/docs/analyzer/checkers.html#security-floatloopcounter>`_,
+ :doc:`clang-analyzer-security.cert.env.InvalidPtr <clang-analyzer/security.cert.env.InvalidPtr>`, `Clang Static Analyzer security.cert.env.InvalidPtr <https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr>`_,
:doc:`clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling <clang-analyzer/security.insecureAPI.DeprecatedOrUnsafeBufferHandling>`, `Clang Static Analyzer security.insecureAPI.DeprecatedOrUnsafeBufferHandling <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-deprecatedorunsafebufferhandling>`_,
:doc:`clang-analyzer-security.insecureAPI.UncheckedReturn <clang-analyzer/security.insecureAPI.UncheckedReturn>`, `Clang Static Analyzer security.insecureAPI.UncheckedReturn <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-uncheckedreturn>`_,
:doc:`clang-analyzer-security.insecureAPI.bcmp <clang-analyzer/security.insecureAPI.bcmp>`, `Clang Static Analyzer security.insecureAPI.bcmp <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-bcmp>`_,
@@ -492,9 +495,11 @@ Clang-Tidy Checks
:doc:`clang-analyzer-security.insecureAPI.strcpy <clang-analyzer/security.insecureAPI.strcpy>`, `Clang Static Analyzer security.insecureAPI.strcpy <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-strcpy>`_,
:doc:`clang-analyzer-security.insecureAPI.vfork <clang-analyzer/security.insecureAPI.vfork>`, `Clang Static Analyzer security.insecureAPI.vfork <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-vfork>`_,
:doc:`clang-analyzer-unix.API <clang-analyzer/unix.API>`, `Clang Static Analyzer unix.API <https://clang.llvm.org/docs/analyzer/checkers.html#unix-api>`_,
+ :doc:`clang-analyzer-unix.Errno <clang-analyzer/unix.Errno>`, `Clang Static Analyzer unix.Errno <https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno>`_,
:doc:`clang-analyzer-unix.Malloc <clang-analyzer/unix.Malloc>`, `Clang Static Analyzer unix.Malloc <https://clang.llvm.org/docs/analyzer/checkers.html#unix-malloc>`_,
:doc:`clang-analyzer-unix.MallocSizeof <clang-analyzer/unix.MallocSizeof>`, `Clang Static Analyzer unix.MallocSizeof <https://clang.llvm.org/docs/analyzer/checkers.html#unix-mallocsizeof>`_,
:doc:`clang-analyzer-unix.MismatchedDeallocator <clang-analyzer/unix.MismatchedDeallocator>`, `Clang Static Analyzer unix.MismatchedDeallocator <https://clang.llvm.org/docs/analyzer/checkers.html#unix-mismatcheddeallocator>`_,
+ :doc:`clang-analyzer-unix.StdCLibraryFunctions <clang-analyzer/unix.StdCLibraryFunctions>`, `Clang Static Analyzer unix.StdCLibraryFunctions <https://clang.llvm.org/docs/analyzer/checkers.html#unix-stdclibraryfunctions>`_,
:doc:`clang-analyzer-unix.Vfork <clang-analyzer/unix.Vfork>`, `Clang Static Analyzer unix.Vfork <https://clang.llvm.org/docs/analyzer/checkers.html#unix-vfork>`_,
:doc:`clang-analyzer-unix.cstring.BadSizeArg <clang-analyzer/unix.cstring.BadSizeArg>`, `Clang Static Analyzer unix.cstring.BadSizeArg <https://clang.llvm.org/docs/analyzer/checkers.html#unix-cstring-badsizearg>`_,
:doc:`clang-analyzer-unix.cstring.NullArg <clang-analyzer/unix.cstring.NullArg>`, `Clang Static Analyzer unix.cstring.NullArg <https://clang.llvm.org/docs/analyzer/checkers.html#unix-cstring-nullarg>`_,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst
index eeddaf8d8d65..32272a07994c 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst
@@ -28,6 +28,14 @@ After:
using R_t = struct { int a; };
using R_p = R_t*;
+When the `IgnoreExternC` option is enabled, the check ignores ``typedef``
+declarations inside ``extern "C" { ... }`` blocks.
+
+.. code-block:: c++
+
+ extern "C" {
+ typedef int InExternC; // Left intact.
+ }
+
This check requires using C++11 or higher to run.
Options
@@ -37,3 +45,8 @@ Options
If set to `true`, the check will not give warnings inside macros. Default
is `true`.
+
+.. option:: IgnoreExternC
+
+   If set to `true`, the check will not give warnings inside ``extern "C"``
+   scope. Default is `false`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst
index e36bbee394f1..2affb55cfa9a 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst
@@ -42,6 +42,7 @@ The following options are described below:
- :option:`AbstractClassCase`, :option:`AbstractClassPrefix`, :option:`AbstractClassSuffix`, :option:`AbstractClassIgnoredRegexp`, :option:`AbstractClassHungarianPrefix`
- :option:`AggressiveDependentMemberLookup`
+ - :option:`CheckAnonFieldInParent`
- :option:`ClassCase`, :option:`ClassPrefix`, :option:`ClassSuffix`, :option:`ClassIgnoredRegexp`, :option:`ClassHungarianPrefix`
- :option:`ClassConstantCase`, :option:`ClassConstantPrefix`, :option:`ClassConstantSuffix`, :option:`ClassConstantIgnoredRegexp`, :option:`ClassConstantHungarianPrefix`
- :option:`ClassMemberCase`, :option:`ClassMemberPrefix`, :option:`ClassMemberSuffix`, :option:`ClassMemberIgnoredRegexp`, :option:`ClassMemberHungarianPrefix`
@@ -207,6 +208,32 @@ After if AggressiveDependentMemberLookup is `true`:
}
};
+.. option:: CheckAnonFieldInParent
+
+ When set to `true`, fields in anonymous records (i.e. anonymous
+ unions and structs) will be treated as names in the enclosing scope
+ rather than public members of the anonymous record for the purpose
+ of name checking.
+
+For example:
+
+.. code-block:: c++
+
+ class Foo {
+ private:
+ union {
+ int iv_;
+ float fv_;
+ };
+ };
+
+If :option:`CheckAnonFieldInParent` is `false`, you may get warnings
+that ``iv_`` and ``fv_`` do not conform to the naming rules for public
+members, because ``iv_`` and ``fv_`` are public members of the anonymous
+union. When :option:`CheckAnonFieldInParent` is `true`, ``iv_`` and ``fv_``
+will be treated as private data members of ``Foo`` for the purpose of name
+checking and thus no warnings will be emitted.
+
.. option:: ClassCase
When defined, the check will ensure class names conform to the
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst
index f3f44bedcf74..4b7d7f2ddcf4 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst
@@ -20,4 +20,4 @@ Options
.. option:: Types
The list of type(s) that triggers this check. Default is
- `::std::basic_string;::std::basic_string_view;::std::vector;::std::array`
+ `::std::basic_string;::std::basic_string_view;::std::vector;::std::array;::std::span`
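+
+For instance, with ``std::span`` now covered, the check flags subscripting
+through ``data()`` and suggests the direct form (an illustrative snippet;
+the function name is arbitrary):
+
+.. code-block:: c++
+
+  #include <span>
+
+  int first(std::span<int> Values) {
+    return Values.data()[0]; // the check suggests: return Values[0];
+  }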
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp
new file mode 100644
index 000000000000..19f2344de4a6
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp
@@ -0,0 +1,118 @@
+// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-unused-local-non-trivial-variable %t -- \
+// RUN: -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Future;::async::Foo.*', bugprone-unused-local-non-trivial-variable.ExcludeTypes: '::async::FooBar'}}" \
+// RUN: -- -fexceptions
+
+namespace async {
+template <typename T>
+class Ptr {
+ public:
+ explicit Ptr(T Arg) : Underlying(new T(Arg)) {}
+ T& operator->() {
+ return Underlying;
+ }
+ ~Ptr() {
+ delete Underlying;
+ }
+ private:
+ T* Underlying;
+};
+
+template<typename T>
+class Future {
+public:
+ T get() {
+ return Pending;
+ }
+ ~Future();
+private:
+ T Pending;
+};
+
+class FooBar {
+ public:
+ ~FooBar();
+ private:
+ Future<int> Fut;
+};
+
+class FooQux {
+ public:
+ ~FooQux();
+ private:
+ Future<int> Fut;
+};
+
+class FizzFoo {
+ public:
+ ~FizzFoo();
+ private:
+ Future<int> Fut;
+};
+
+} // namespace async
+
+// Warning is still emitted if there are type aliases.
+namespace a {
+template<typename T>
+using Future = async::Future<T>;
+} // namespace a
+
+void releaseUnits();
+struct Units {
+ ~Units() {
+ releaseUnits();
+ }
+};
+a::Future<Units> acquireUnits();
+
+template<typename T>
+T qux(T Generic) {
+ async::Future<Units> PendingA = acquireUnits();
+ auto PendingB = acquireUnits();
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: unused local variable 'PendingB' of type 'a::Future<Units>' (aka 'Future<Units>') [bugprone-unused-local-non-trivial-variable]
+ async::Future<Units> MustBeUsed;
+ // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: unused local variable 'MustBeUsed' of type 'async::Future<Units>' [bugprone-unused-local-non-trivial-variable]
+ PendingA.get();
+ async::Future<T> TemplateType;
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: unused local variable 'TemplateType' of type 'async::Future<T>' [bugprone-unused-local-non-trivial-variable]
+ a::Future<T> AliasTemplateType;
+ // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: unused local variable 'AliasTemplateType' of type 'a::Future<T>' (aka 'Future<type-parameter-0-0>') [bugprone-unused-local-non-trivial-variable]
+ return Generic;
+}
+
+async::Future<int> Global;
+
+int bar(int Num) {
+ a::Future<Units> PendingA = acquireUnits();
+ a::Future<Units> PendingB = acquireUnits(); // not used at all, unused variable not fired because of destructor side effect
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: unused local variable 'PendingB' of type 'a::Future<Units>' (aka 'Future<Units>') [bugprone-unused-local-non-trivial-variable]
+ auto Num2 = PendingA.get();
+ auto Num3 = qux(Num);
+ async::Ptr<a::Future<Units>> Shared = async::Ptr<a::Future<Units>>(acquireUnits());
+ static auto UnusedStatic = async::Future<Units>();
+ thread_local async::Future<Units> UnusedThreadLocal;
+ auto Captured = acquireUnits();
+ Num3 += [Captured]() {
+ return 1;
+ }();
+ a::Future<Units> Referenced = acquireUnits();
+ a::Future<Units>* Pointer = &Referenced;
+ a::Future<Units>& Reference = Referenced;
+ const a::Future<Units>& ConstReference = Referenced;
+ try {
+ } catch (a::Future<Units> Fut) {
+ }
+ struct Holder {
+ a::Future<Units> Fut;
+ };
+ Holder H;
+ auto [fut] = H;
+ return Num * Num3;
+}
+
+void exclusion() {
+ async::FizzFoo A;
+ async::FooBar B;
+ async::FooQux C;
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: unused local variable 'C' of type 'async::FooQux' [bugprone-unused-local-non-trivial-variable]
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp
new file mode 100644
index 000000000000..6a845a0bcc35
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp
@@ -0,0 +1,14 @@
+// RUN: %check_clang_tidy %s modernize-use-using %t -- -config="{CheckOptions: {modernize-use-using.IgnoreExternC: true}}" -- -I %S/Input/use-using/
+
+// Some Header
+extern "C" {
+
+typedef int NewInt;
+}
+
+extern "C++" {
+
+typedef int InExternCPP;
+// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using]
+// CHECK-FIXES: using InExternCPP = int;
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
index 422abee11a71..462bc984fd3a 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp
@@ -325,3 +325,20 @@ typedef bool (*ISSUE_65055_2)(int);
typedef class ISSUE_67529_1 *ISSUE_67529;
// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef'
// CHECK-FIXES: using ISSUE_67529 = class ISSUE_67529_1 *;
+
+// Some Header
+extern "C" {
+
+typedef int InExternC;
+// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using]
+// CHECK-FIXES: using InExternC = int;
+
+}
+
+extern "C++" {
+
+typedef int InExternCPP;
+// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using]
+// CHECK-FIXES: using InExternCPP = int;
+
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp
new file mode 100644
index 000000000000..1b4d4e924a72
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp
@@ -0,0 +1,185 @@
+// RUN: %check_clang_tidy -std=c++20 %s readability-identifier-naming %t -- \
+// RUN: -config='{CheckOptions: { \
+// RUN: readability-identifier-naming.CheckAnonFieldInParent: true, \
+// RUN: readability-identifier-naming.ClassConstantCase: CamelCase, \
+// RUN: readability-identifier-naming.ClassConstantPrefix: 'k', \
+// RUN: readability-identifier-naming.ClassMemberCase: CamelCase, \
+// RUN: readability-identifier-naming.ConstantCase: UPPER_CASE, \
+// RUN: readability-identifier-naming.ConstantSuffix: '_CST', \
+// RUN: readability-identifier-naming.ConstexprVariableCase: lower_case, \
+// RUN: readability-identifier-naming.GlobalConstantCase: UPPER_CASE, \
+// RUN: readability-identifier-naming.GlobalVariableCase: lower_case, \
+// RUN: readability-identifier-naming.GlobalVariablePrefix: 'g_', \
+// RUN: readability-identifier-naming.LocalConstantCase: CamelCase, \
+// RUN: readability-identifier-naming.LocalConstantPrefix: 'k', \
+// RUN: readability-identifier-naming.LocalVariableCase: lower_case, \
+// RUN: readability-identifier-naming.MemberCase: CamelCase, \
+// RUN: readability-identifier-naming.MemberPrefix: 'm_', \
+// RUN: readability-identifier-naming.ConstantMemberCase: lower_case, \
+// RUN: readability-identifier-naming.PrivateMemberPrefix: '__', \
+// RUN: readability-identifier-naming.ProtectedMemberPrefix: '_', \
+// RUN: readability-identifier-naming.PublicMemberCase: lower_case, \
+// RUN: readability-identifier-naming.StaticConstantCase: UPPER_CASE, \
+// RUN: readability-identifier-naming.StaticVariableCase: camelBack, \
+// RUN: readability-identifier-naming.StaticVariablePrefix: 's_', \
+// RUN: readability-identifier-naming.VariableCase: lower_case, \
+// RUN: readability-identifier-naming.GlobalPointerCase: CamelCase, \
+// RUN: readability-identifier-naming.GlobalPointerSuffix: '_Ptr', \
+// RUN: readability-identifier-naming.GlobalConstantPointerCase: UPPER_CASE, \
+// RUN: readability-identifier-naming.GlobalConstantPointerSuffix: '_Ptr', \
+// RUN: readability-identifier-naming.LocalPointerCase: CamelCase, \
+// RUN: readability-identifier-naming.LocalPointerPrefix: 'l_', \
+// RUN: readability-identifier-naming.LocalConstantPointerCase: CamelCase, \
+// RUN: readability-identifier-naming.LocalConstantPointerPrefix: 'lc_', \
+// RUN: }}'
+
+static union {
+ int global;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'global'
+// CHECK-FIXES: {{^}} int g_global;{{$}}
+
+ const int global_const;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global constant 'global_const'
+// CHECK-FIXES: {{^}} const int GLOBAL_CONST;{{$}}
+
+ int *global_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'global_ptr'
+// CHECK-FIXES: {{^}} int *GlobalPtr_Ptr;{{$}}
+
+ int *const global_const_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global constant pointer 'global_const_ptr'
+// CHECK-FIXES: {{^}} int *const GLOBAL_CONST_PTR_Ptr;{{$}}
+};
+
+namespace ns {
+
+static union {
+ int ns_global;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'ns_global'
+// CHECK-FIXES: {{^}} int g_ns_global;{{$}}
+
+ const int ns_global_const;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global constant 'ns_global_const'
+// CHECK-FIXES: {{^}} const int NS_GLOBAL_CONST;{{$}}
+
+ int *ns_global_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'ns_global_ptr'
+// CHECK-FIXES: {{^}} int *NsGlobalPtr_Ptr;{{$}}
+
+ int *const ns_global_const_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global constant pointer 'ns_global_const_ptr'
+// CHECK-FIXES: {{^}} int *const NS_GLOBAL_CONST_PTR_Ptr;{{$}}
+};
+
+namespace {
+
+union {
+ int anon_ns_global;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'anon_ns_global'
+// CHECK-FIXES: {{^}} int g_anon_ns_global;{{$}}
+
+ const int anon_ns_global_const;
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global constant 'anon_ns_global_const'
+// CHECK-FIXES: {{^}} const int ANON_NS_GLOBAL_CONST;{{$}}
+
+ int *anon_ns_global_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'anon_ns_global_ptr'
+// CHECK-FIXES: {{^}} int *AnonNsGlobalPtr_Ptr;{{$}}
+
+ int *const anon_ns_global_const_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global constant pointer 'anon_ns_global_const_ptr'
+// CHECK-FIXES: {{^}} int *const ANON_NS_GLOBAL_CONST_PTR_Ptr;{{$}}
+};
+
+}
+
+}
+
+
+class Foo {
+public:
+ union {
+ int PubMember;
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for public member 'PubMember'
+// CHECK-FIXES: {{^}} int pub_member;{{$}}
+
+ const int PubConstMember;
+// CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for constant member 'PubConstMember'
+// CHECK-FIXES: {{^}} const int pub_const_member;{{$}}
+
+ int *PubPtrMember;
+// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for public member 'PubPtrMember'
+// CHECK-FIXES: {{^}} int *pub_ptr_member;{{$}}
+
+ int *const PubConstPtrMember;
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for constant member 'PubConstPtrMember'
+// CHECK-FIXES: {{^}} int *const pub_const_ptr_member;{{$}}
+ };
+
+protected:
+ union {
+ int prot_member;
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for protected member 'prot_member'
+// CHECK-FIXES: {{^}} int _prot_member;{{$}}
+
+ const int prot_const_member;
+
+ int *prot_ptr_member;
+// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for protected member 'prot_ptr_member'
+// CHECK-FIXES: {{^}} int *_prot_ptr_member;{{$}}
+
+ int *const prot_const_ptr_member;
+ };
+
+
+private:
+ union {
+ int pri_member;
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for private member 'pri_member'
+// CHECK-FIXES: {{^}} int __pri_member;{{$}}
+
+ const int pri_const_member;
+
+ int *pri_ptr_member;
+// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for private member 'pri_ptr_member'
+// CHECK-FIXES: {{^}} int *__pri_ptr_member;{{$}}
+
+ int *const pri_const_ptr_member;
+ };
+};
+
+void test() {
+ union {
+ int local;
+
+ const int local_const;
+// CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for local constant 'local_const'
+// CHECK-FIXES: {{^}} const int kLocalConst;{{$}}
+
+ int *local_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for local pointer 'local_ptr'
+// CHECK-FIXES: {{^}} int *l_LocalPtr;{{$}}
+
+ int *const local_const_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for local constant pointer 'local_const_ptr'
+// CHECK-FIXES: {{^}} int *const lc_LocalConstPtr;{{$}}
+ };
+
+ static union {
+ int local_static;
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for static variable 'local_static'
+// CHECK-FIXES: {{^}} int s_localStatic;{{$}}
+
+ const int local_static_const;
+// CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for static constant 'local_static_const'
+// CHECK-FIXES: {{^}} const int LOCAL_STATIC_CONST;{{$}}
+
+ int *local_static_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for static variable 'local_static_ptr'
+// CHECK-FIXES: {{^}} int *s_localStaticPtr;{{$}}
+
+ int *const local_static_const_ptr;
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for static constant 'local_static_const_ptr'
+// CHECK-FIXES: {{^}} int *const LOCAL_STATIC_CONST_PTR;{{$}}
+ };
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp
index aea0618d120d..5d4d60f5f1a3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp
@@ -4,6 +4,8 @@ void foo1();
void foo2();
void foo3();
+#define E
+
#define BLOCK \
if (cond1) \
foo1(); \
@@ -109,6 +111,13 @@ void f()
}
BLOCK
+
+ if (cond1)
+ foo1();
+ else
+ foo2();
+ E foo3();
+ // CHECK-MESSAGES-NOT: :[[@LINE-1]]readability-misleading-indentation
}
void g(bool x) {
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 2ca6db02e587..9f814478c455 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -167,6 +167,23 @@ endif()
include(CheckIncludeFile)
check_include_file(sys/resource.h CLANG_HAVE_RLIMITS)
+# This check requires _GNU_SOURCE on linux
+check_include_file(dlfcn.h CLANG_HAVE_DLFCN_H)
+if( CLANG_HAVE_DLFCN_H )
+ include(CheckLibraryExists)
+ include(CheckSymbolExists)
+ check_library_exists(dl dlopen "" HAVE_LIBDL)
+ if( HAVE_LIBDL )
+ list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
+ endif()
+ list(APPEND CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
+ check_symbol_exists(dladdr dlfcn.h CLANG_HAVE_DLADDR)
+ list(REMOVE_ITEM CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
+ if( HAVE_LIBDL )
+ list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES dl)
+ endif()
+endif()
+
set(CLANG_RESOURCE_DIR "" CACHE STRING
"Relative directory from the Clang binary to its resource files.")
diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst
index f52f35550d03..8d4017b29fb8 100644
--- a/clang/docs/ClangFormat.rst
+++ b/clang/docs/ClangFormat.rst
@@ -131,6 +131,27 @@ An easy way to create the ``.clang-format`` file is:
Available style options are described in :doc:`ClangFormatStyleOptions`.
+You can create ``.clang-format-ignore`` files to make ``clang-format`` ignore
+certain files. A ``.clang-format-ignore`` file consists of patterns of file path
+names. It has the following format:
+
+* A blank line is skipped.
+* Leading and trailing spaces of a line are trimmed.
+* A line starting with a hash (``#``) is a comment.
+* A non-comment line is a single pattern.
+* The slash (``/``) is used as the directory separator.
+* A pattern is relative to the directory of the ``.clang-format-ignore`` file
+ (or the root directory if the pattern starts with a slash).
+* Patterns follow the rules specified in `POSIX 2.13.1, 2.13.2, and Rule 1 of
+ 2.13.3 <https://pubs.opengroup.org/onlinepubs/9699919799/utilities/
+ V3_chap02.html#tag_18_13>`_.
+* A pattern is negated if it starts with a bang (``!``).
+
+To match all files in a directory, use e.g. ``foo/bar/*``. To match all files in
+the directory of the ``.clang-format-ignore`` file, use ``*``.
+Multiple ``.clang-format-ignore`` files are supported, similar to
+``.clang-format`` files: a file at a lower directory level overrides
+files at higher levels.
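+
+For example, a ``.clang-format-ignore`` file using these rules (with purely
+illustrative, hypothetical paths) might look like this:
+
+.. code-block:: none
+
+  # Lines starting with a hash are comments.
+
+  # Do not format any file directly inside third_party.
+  third_party/*
+  # ...except this vendored header.
+  !third_party/config.h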
Vim Integration
===============
diff --git a/clang/docs/ControlFlowIntegrityDesign.rst b/clang/docs/ControlFlowIntegrityDesign.rst
index f3a3c8294f7c..d66bd16155a9 100644
--- a/clang/docs/ControlFlowIntegrityDesign.rst
+++ b/clang/docs/ControlFlowIntegrityDesign.rst
@@ -349,7 +349,7 @@ address point. Note that libraries like libcxxabi do assume this property.
(2) virtual function entry layout property
-For each virtual function the distance between an virtual table entry for this function and the corresponding
+For each virtual function the distance between a virtual table entry for this function and the corresponding
address point is always the same. This property ensures that dynamic dispatch still works with the interleaving layout.
Note that the interleaving scheme in the CFI implementation guarantees both properties above whereas the original scheme proposed
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 13fb7c345aa4..23a7f4f5d5b9 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -2019,7 +2019,7 @@ would be +1. ``ns_returns_autoreleased`` specifies that the returned object is
autorelease pool.
**Usage**: The ``ns_consumed`` and ``cf_consumed`` attributes can be placed on
-an parameter declaration; they specify that the argument is expected to have a
+a parameter declaration; they specify that the argument is expected to have a
+1 retain count, which will be balanced in some way by the function or method.
The ``ns_consumes_self`` attribute can only be placed on an Objective-C
method; it specifies that the method expects its ``self`` parameter to have a
@@ -3601,7 +3601,7 @@ scalar calls of ``__builtin_isfpclass`` applied to the input elementwise.
The result of ``__builtin_isfpclass`` is a boolean value, if the first argument
is a scalar, or an integer vector with the same element count as the first
argument. The element type in this vector has the same bit length as the
-element of the the first argument type.
+element of the first argument type.
This function never raises floating-point exceptions and does not canonicalize
its input. The floating-point argument is not promoted, its data class is
@@ -4959,7 +4959,7 @@ Clang supports the following match rules:
- ``record(unless(is_union))``: Can be used to apply attributes only to
``struct`` and ``class`` declarations.
-- ``enum``: Can be be used to apply attributes to enumeration declarations.
+- ``enum``: Can be used to apply attributes to enumeration declarations.
- ``enum_constant``: Can be used to apply attributes to enumerators.
diff --git a/clang/docs/LibASTMatchersTutorial.rst b/clang/docs/LibASTMatchersTutorial.rst
index 37c9f178fa8d..2a58502a8de4 100644
--- a/clang/docs/LibASTMatchersTutorial.rst
+++ b/clang/docs/LibASTMatchersTutorial.rst
@@ -50,7 +50,7 @@ Okay. Now we'll build Clang!
cd ~/clang-llvm
mkdir build && cd build
- cmake -G Ninja ../llvm -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra" -DLLVM_BUILD_TESTS=ON # Enable tests; default is off.
+ cmake -G Ninja ../llvm -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra" -DCMAKE_BUILD_TYPE=Release -DLLVM_BUILD_TESTS=ON
ninja
ninja check # Test LLVM only.
ninja clang-test # Test Clang only.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 39b9176865fc..0c8fec691bf3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -253,7 +253,7 @@ New Compiler Flags
the preprocessed text to the output. This can greatly reduce the size of the
preprocessed output, which can be helpful when trying to reduce a test case.
* ``-fassume-nothrow-exception-dtor`` is added to assume that the destructor of
- an thrown exception object will not throw. The generated code for catch
+ a thrown exception object will not throw. The generated code for catch
handlers will be smaller. A throw expression of a type with a
potentially-throwing destructor will lead to an error.
@@ -1041,6 +1041,7 @@ clang-format
- Add ``BreakAdjacentStringLiterals`` option.
- Add ``ObjCPropertyAttributeOrder`` which can be used to sort ObjC property
attributes (like ``nonatomic, strong, nullable``).
+- Add ``.clang-format-ignore`` files.
libclang
--------
@@ -1052,18 +1053,119 @@ libclang
Static Analyzer
---------------
+New features
+^^^^^^^^^^^^
+
+- Implemented the ``[[clang::suppress]]`` attribute for suppressing diagnostics
+ of static analysis tools, such as the Clang Static Analyzer.
+ `Documentation <https://clang.llvm.org/docs/AttributeReference.html#suppress>`__.
+
+- Added support for the ``cleanup`` attribute.
+ `Documentation <https://clang.llvm.org/docs/AttributeReference.html#cleanup>`__.
+
+- Support "Deducing this" (P0847R7). (Worked out of the box)
+ (`af4751738db8 <https://github.com/llvm/llvm-project/commit/af4751738db89a142a8880c782d12d4201b222a8>`__)
+
- Added a new checker ``core.BitwiseShift`` which reports situations where
bitwise shift operators produce undefined behavior (because some operand is
negative or too large).
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#core-bitwiseshift-c-c>`__.
+
+- Added a new experimental checker ``alpha.core.StdVariant`` to detect variant
+ accesses via wrong alternatives.
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#alpha-core-stdvariant-c>`__.
+ (`#66481 <https://github.com/llvm/llvm-project/pull/66481>`_)
+
+- Added a new experimental checker ``alpha.cplusplus.ArrayDelete`` to detect
+ destructions of arrays of polymorphic objects that are destructed as their
+ base class (`CERT EXP51-CPP <https://wiki.sei.cmu.edu/confluence/display/cplusplus/EXP51-CPP.+Do+not+delete+an+array+through+a+pointer+of+the+incorrect+type>`_).
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#alpha-cplusplus-arraydelete-c>`__.
+ (`0e246bb67573 <https://github.com/llvm/llvm-project/commit/0e246bb67573799409d0085b89902a330998ddcc>`_)
+
+- Added a new checker configuration option ``InvalidatingGetEnv=[true,false]`` to
+ ``security.cert.env.InvalidPtr``. It's not set by default.
+ If set, ``getenv`` calls won't invalidate previously returned pointers.
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr>`__.
+ (`#67663 <https://github.com/llvm/llvm-project/pull/67663>`_)
+
+Crash and bug fixes
+^^^^^^^^^^^^^^^^^^^
-- Move checker ``alpha.unix.Errno`` out of the ``alpha`` package
- to ``unix.Errno``.
+- Fixed a crash caused by ``builtin_bit_cast``.
+ (`#69922 <https://github.com/llvm/llvm-project/issues/69922>`_)
-- Move checker ``alpha.unix.StdCLibraryFunctions`` out of the ``alpha`` package
- to ``unix.StdCLibraryFunctions``.
+- Fixed a ``core.StackAddressEscape`` crash on temporary object fields.
+ (`#66221 <https://github.com/llvm/llvm-project/issues/66221>`_)
+
+- A few crashes have been found and fixed using randomized testing related
+ to the use of ``_BitInt()`` in tidy checks and in clang analysis.
+ (`#67212 <https://github.com/llvm/llvm-project/pull/67212>`_,
+ `#66782 <https://github.com/llvm/llvm-project/pull/66782>`_,
+ `#65889 <https://github.com/llvm/llvm-project/pull/65889>`_,
+ `#65888 <https://github.com/llvm/llvm-project/pull/65888>`_,
+ `#65887 <https://github.com/llvm/llvm-project/pull/65887>`_)
+
+- Fixed note links of the HTML output.
+ (`#64054 <https://github.com/llvm/llvm-project/issues/64054>`_)
+
+- Allow widening range-based for loops.
+ (`#70190 <https://github.com/llvm/llvm-project/pull/70190>`_)
+
+- Fixed uninitialized base class with initializer list when ctor is not
+ declared in the base class.
+ (`#70464 <https://github.com/llvm/llvm-project/issues/70464>`_,
+ `#59493 <https://github.com/llvm/llvm-project/issues/59493>`_,
+ `#54533 <https://github.com/llvm/llvm-project/issues/54533>`_)
+
+- Fixed an ``alpha.unix.cstring`` crash on variadic functions.
+ (`#74269 <https://github.com/llvm/llvm-project/issues/74269>`_)
- Fix false positive in mutation check when using pointer to member function.
- (`#66204: <https://github.com/llvm/llvm-project/issues/66204>`_).
+ (`#66204 <https://github.com/llvm/llvm-project/issues/66204>`_)
+
+Improvements
+^^^^^^^^^^^^
+
+- Improved the ``unix.StdCLibraryFunctions`` checker by modeling more
+  functions, such as ``send``, ``recv``, ``readlink`` and ``fflush``, and by
+  modeling their ``errno`` behavior.
+ (`52ac71f92d38 <https://github.com/llvm/llvm-project/commit/52ac71f92d38f75df5cb88e9c090ac5fd5a71548>`_,
+ `#71373 <https://github.com/llvm/llvm-project/pull/71373>`_,
+ `#76557 <https://github.com/llvm/llvm-project/pull/76557>`_,
+ `#71392 <https://github.com/llvm/llvm-project/pull/71392>`_)
+
+- Fixed a false negative when accessing a nonnull property (ObjC).
+ (`1dceba3a3684 <https://github.com/llvm/llvm-project/commit/1dceba3a3684d12394731e09a6cf3efcebf07a3a>`_)
+
+- ``security.insecureAPI.DeprecatedOrUnsafeBufferHandling`` now considers
+ ``fprintf`` calls unsafe.
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#security-insecureapi-deprecatedorunsafebufferhandling-c>`__.
+
+- Improved the diagnostics of the ``optin.core.EnumCastOutOfRange`` checker.
+ It will display the name and the declaration of the enumeration along with
+ the concrete value being cast to the enum.
+ (`#74503 <https://github.com/llvm/llvm-project/pull/74503>`_)
+
+- Improved the ``alpha.security.ArrayBoundV2`` checker for detecting buffer
+  accesses before the start of the buffer, and reworked the diagnostic messages.
+ (`3e014038b373 <https://github.com/llvm/llvm-project/commit/3e014038b373e5a4a96d89d46cea17e4d2456a04>`_,
+ `#70056 <https://github.com/llvm/llvm-project/pull/70056>`_,
+ `#72107 <https://github.com/llvm/llvm-project/pull/72107>`_)
+
+- Improved the ``alpha.unix.cstring.OutOfBounds`` checker to check both ends of
+  the buffers in more cases.
+ (`c3a87ddad62a <https://github.com/llvm/llvm-project/commit/c3a87ddad62a6cc01acaccc76592bc6730c8ac3c>`_,
+ `0954dc3fb921 <https://github.com/llvm/llvm-project/commit/0954dc3fb9214b994623f5306473de075f8e3593>`_)
+
+- Improved the ``alpha.unix.Stream`` checker by modeling more functions, like
+  ``fflush``, ``fputs``, ``fgetc``, ``fputc``, ``fopen`` and ``fgets``.
+ (`#74296 <https://github.com/llvm/llvm-project/pull/74296>`_,
+ `#73335 <https://github.com/llvm/llvm-project/pull/73335>`_,
+ `#72627 <https://github.com/llvm/llvm-project/pull/72627>`_,
+ `#71518 <https://github.com/llvm/llvm-project/pull/71518>`_,
+ `#72016 <https://github.com/llvm/llvm-project/pull/72016>`_,
+ `#70540 <https://github.com/llvm/llvm-project/pull/70540>`_,
+ `#73638 <https://github.com/llvm/llvm-project/pull/73638>`_)
- The ``alpha.security.taint.TaintPropagation`` checker no longer propagates
taint on ``strlen`` and ``strnlen`` calls, unless these are marked
@@ -1072,16 +1174,32 @@ Static Analyzer
Read the PR for the details.
(`#66086 <https://github.com/llvm/llvm-project/pull/66086>`_)
-- A few crashes have been found and fixed using randomized testing related
- to the use of ``_BitInt()`` in tidy checks and in clang analysis. See
- `#67212 <https://github.com/llvm/llvm-project/pull/67212>`_,
- `#66782 <https://github.com/llvm/llvm-project/pull/66782>`_,
- `#65889 <https://github.com/llvm/llvm-project/pull/65889>`_,
- `#65888 <https://github.com/llvm/llvm-project/pull/65888>`_, and
- `#65887 <https://github.com/llvm/llvm-project/pull/65887>`_
+- Other taint-related improvements.
+ (`#66358 <https://github.com/llvm/llvm-project/pull/66358>`_,
+  `#66074 <https://github.com/llvm/llvm-project/pull/66074>`_)
+
+- Checkers can query constraint bounds to improve diagnostic messages.
+ (`#74141 <https://github.com/llvm/llvm-project/pull/74141>`_)
+
+Moved checkers
+^^^^^^^^^^^^^^
+
+- Move checker ``alpha.unix.Errno`` out of the ``alpha`` package
+ to ``unix.Errno``.
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-errno-c>`__.
+
+- Move checker ``alpha.unix.StdCLibraryFunctions`` out of the ``alpha`` package
+ to ``unix.StdCLibraryFunctions``.
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#unix-stdclibraryfunctions-c>`__.
+
+- Move checker ``alpha.security.cert.env.InvalidPtr`` out of the ``alpha``
+ package to ``security.cert.env.InvalidPtr``.
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#security-cert-env-invalidptr>`__.
- Move checker ``alpha.cplusplus.EnumCastOutOfRange`` out of the ``alpha``
package to ``optin.core.EnumCastOutOfRange``.
+ `Documentation <https://clang.llvm.org/docs/analyzer/checkers.html#optin-core-enumcastoutofrange-c-c>`__.
.. _release-notes-sanitizers:
diff --git a/clang/docs/SanitizerCoverage.rst b/clang/docs/SanitizerCoverage.rst
index c7ced397c722..45ad03cb4377 100644
--- a/clang/docs/SanitizerCoverage.rst
+++ b/clang/docs/SanitizerCoverage.rst
@@ -496,7 +496,7 @@ offsets in the corresponding binary/DSO that were executed during the run.
Sancov Tool
-----------
-An simple ``sancov`` tool is provided to process coverage files.
+A simple ``sancov`` tool is provided to process coverage files.
The tool is part of LLVM project and is currently supported only on Linux.
It can handle symbolization tasks autonomously without any extra support
from the environment. You need to pass .sancov files (named
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 81d40395067c..bb637cf1b800 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1025,7 +1025,7 @@ security.insecureAPI.vfork (C)
security.insecureAPI.DeprecatedOrUnsafeBufferHandling (C)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
- Warn on occurrences of unsafe or deprecated buffer handling functions, which now have a secure variant: ``sprintf, vsprintf, scanf, wscanf, fscanf, fwscanf, vscanf, vwscanf, vfscanf, vfwscanf, sscanf, swscanf, vsscanf, vswscanf, swprintf, snprintf, vswprintf, vsnprintf, memcpy, memmove, strncpy, strncat, memset``
+ Warn on occurrences of unsafe or deprecated buffer handling functions, which now have a secure variant: ``sprintf, fprintf, vsprintf, scanf, wscanf, fscanf, fwscanf, vscanf, vwscanf, vfscanf, vfwscanf, sscanf, swscanf, vsscanf, vswscanf, swprintf, snprintf, vswprintf, vsnprintf, memcpy, memmove, strncpy, strncat, memset``
.. code-block:: c
@@ -2095,6 +2095,21 @@ This checker is a part of ``core.StackAddressEscape``, but is temporarily disabl
// returned block
}
+.. _alpha-core-StdVariant:
+
+alpha.core.StdVariant (C++)
+"""""""""""""""""""""""""""
+Check whether a value is retrieved from an ``std::variant`` instance with
+``std::get`` using a type that is not the currently active alternative. In case
+of such a bad variant type access (the accessed type differs from the active
+type) a warning is emitted. Currently, this checker does not take exception
+handling into account.
+
+.. code-block:: cpp
+
+ void test() {
+ std::variant<int, char> v = 25;
+ char c = std::get<char>(v); // warn: "int" is the active alternative
+ }
+
.. _alpha-core-TestAfterDivZero:
alpha.core.TestAfterDivZero (C)
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index b3ae66e6e769..1afa69367286 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2383,10 +2383,6 @@ public:
/// Check if the type is the CUDA device builtin texture type.
bool isCUDADeviceBuiltinTextureType() const;
- bool isRVVType(unsigned ElementCount) const;
-
- bool isRVVType(unsigned Bitwidth, bool IsFloat, bool IsBFloat = false) const;
-
/// Return the implicit lifetime for this type, which must not be dependent.
Qualifiers::ObjCLifetime getObjCARCImplicitLifetime() const;
@@ -7283,28 +7279,6 @@ inline bool Type::isOpenCLSpecificType() const {
isQueueT() || isReserveIDT() || isPipeType() || isOCLExtOpaqueType();
}
-inline bool Type::isRVVType(unsigned ElementCount) const {
- bool Ret = false;
-#define RVV_VECTOR_TYPE(Name, Id, SingletonId, NumEls, ElBits, NF, IsSigned, \
- IsFP, IsBF) \
- if (NumEls == ElementCount) \
- Ret |= isSpecificBuiltinType(BuiltinType::Id);
-#include "clang/Basic/RISCVVTypes.def"
- return Ret;
-}
-
-inline bool Type::isRVVType(unsigned Bitwidth, bool IsFloat,
- bool IsBFloat) const {
- bool Ret = false;
-#define RVV_TYPE(Name, Id, SingletonId)
-#define RVV_VECTOR_TYPE(Name, Id, SingletonId, NumEls, ElBits, NF, IsSigned, \
- IsFP, IsBF) \
- if (ElBits == Bitwidth && IsFloat == IsFP && IsBFloat == IsBF) \
- Ret |= isSpecificBuiltinType(BuiltinType::Id);
-#include "clang/Basic/RISCVVTypes.def"
- return Ret;
-}
-
inline bool Type::isTemplateTypeParmType() const {
return isa<TemplateTypeParmType>(CanonicalType);
}
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
index 5943af50b6ad..e8c27d6c1203 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -289,6 +289,22 @@ public:
/// `E` must be a glvalue or a `BuiltinType::BuiltinFn`
StorageLocation *getStorageLocation(const Expr &E) const;
+ /// Returns the result of casting `getStorageLocation(...)` to a subclass of
+ /// `StorageLocation` (using `cast_or_null<T>`).
+ /// This assert-fails if the result of `getStorageLocation(...)` is not of
+ /// type `T *`; if the storage location is not guaranteed to have type `T *`,
+ /// consider using `dyn_cast_or_null<T>(getStorageLocation(...))` instead.
+ template <typename T>
+ std::enable_if_t<std::is_base_of_v<StorageLocation, T>, T *>
+ get(const ValueDecl &D) const {
+ return cast_or_null<T>(getStorageLocation(D));
+ }
+ template <typename T>
+ std::enable_if_t<std::is_base_of_v<StorageLocation, T>, T *>
+ get(const Expr &E) const {
+ return cast_or_null<T>(getStorageLocation(E));
+ }
+
/// Returns the storage location assigned to the `this` pointee in the
/// environment or null if the `this` pointee has no assigned storage location
/// in the environment.
@@ -457,6 +473,26 @@ public:
/// storage location in the environment, otherwise returns null.
Value *getValue(const Expr &E) const;
+ /// Returns the result of casting `getValue(...)` to a subclass of `Value`
+ /// (using `cast_or_null<T>`).
+ /// This assert-fails if the result of `getValue(...)` is not of type `T *`;
+ /// if the value is not guaranteed to have type `T *`, consider using
+ /// `dyn_cast_or_null<T>(getValue(...))` instead.
+ template <typename T>
+ std::enable_if_t<std::is_base_of_v<Value, T>, T *>
+ get(const StorageLocation &Loc) const {
+ return cast_or_null<T>(getValue(Loc));
+ }
+ template <typename T>
+ std::enable_if_t<std::is_base_of_v<Value, T>, T *>
+ get(const ValueDecl &D) const {
+ return cast_or_null<T>(getValue(D));
+ }
+ template <typename T>
+ std::enable_if_t<std::is_base_of_v<Value, T>, T *> get(const Expr &E) const {
+ return cast_or_null<T>(getValue(E));
+ }
+
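+ // A brief usage sketch (hypothetical caller code; `D` is a `ValueDecl` and
+ // `E` an `Expr` that the caller has already modeled):
+ //
+ //   auto *Loc = Env.get<RecordStorageLocation>(D);
+ //   auto *Val = Env.get<BoolValue>(E);
+ //
+ // instead of spelling out `cast_or_null<RecordStorageLocation>(
+ // Env.getStorageLocation(D))` and `cast_or_null<BoolValue>(Env.getValue(E))`.
+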
// FIXME: should we deprecate the following & call arena().create() directly?
/// Creates a `T` (some subclass of `Value`), forwarding `args` to the
@@ -691,20 +727,9 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME,
std::vector<FieldDecl *> getFieldsForInitListExpr(const RecordDecl *RD);
/// Associates a new `RecordValue` with `Loc` and returns the new value.
-/// It is not defined whether the field values remain the same or not.
-///
-/// This function is primarily intended for use by checks that set custom
-/// properties on `RecordValue`s to model the state of these values. Such checks
-/// should avoid modifying the properties of an existing `RecordValue` because
-/// these changes would be visible to other `Environment`s that share the same
-/// `RecordValue`. Instead, call `refreshRecordValue()`, then set the properties
-/// on the new `RecordValue` that it returns. Typical usage:
-///
-/// refreshRecordValue(Loc, Env).setProperty("my_prop", MyPropValue);
RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env);
/// Associates a new `RecordValue` with `Expr` and returns the new value.
-/// See also documentation for the overload above.
RecordValue &refreshRecordValue(const Expr &Expr, Environment &Env);
} // namespace dataflow
diff --git a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
index 7b87840d626b..783e53e980aa 100644
--- a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
+++ b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
@@ -22,19 +22,13 @@ namespace dataflow {
/// Copies a record (struct, class, or union) from `Src` to `Dst`.
///
/// This performs a deep copy, i.e. it copies every field (including synthetic
-/// fields) and recurses on fields of record type. It also copies properties
-/// from the `RecordValue` associated with `Src` to the `RecordValue` associated
-/// with `Dst` (if these `RecordValue`s exist).
+/// fields) and recurses on fields of record type.
///
/// If there is a `RecordValue` associated with `Dst` in the environment, this
/// function creates a new `RecordValue` and associates it with `Dst`; clients
/// need to be aware of this and must not assume that the `RecordValue`
/// associated with `Dst` remains the same after the call.
///
-/// We create a new `RecordValue` rather than modifying properties on the old
-/// `RecordValue` because the old `RecordValue` may be shared with other
-/// `Environment`s, and we don't want changes to properties to be visible there.
-///
/// Requirements:
///
/// `Src` and `Dst` must have the same canonical unqualified type.
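+///
+/// A brief usage sketch (hypothetical caller code; `SrcLoc`, `DstLoc`, and
+/// `Env` are assumed to be two `RecordStorageLocation`s of the same type and
+/// the current `Environment`):
+///
+///   copyRecord(SrcLoc, DstLoc, Env);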
@@ -49,9 +43,7 @@ void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
///
/// This performs a deep comparison, i.e. it compares every field (including
/// synthetic fields) and recurses on fields of record type. Fields of reference
-/// type compare equal if they refer to the same storage location. If
-/// `RecordValue`s are associated with `Loc1` and Loc2`, it also compares the
-/// properties on those `RecordValue`s.
+/// type compare equal if they refer to the same storage location.
///
/// Note on how to interpret the result:
/// - If this returns true, the records are guaranteed to be equal at runtime.
diff --git a/clang/include/clang/Analysis/FlowSensitive/Value.h b/clang/include/clang/Analysis/FlowSensitive/Value.h
index e6c68e5b4e93..be1bf9324c87 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Value.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Value.h
@@ -63,7 +63,11 @@ public:
/// Assigns `Val` as the value of the synthetic property with the given
/// `Name`.
+ ///
+ /// Properties may not be set on `RecordValue`s; use synthetic fields instead
+ /// (for details, see documentation for `RecordStorageLocation`).
void setProperty(llvm::StringRef Name, Value &Val) {
+ assert(getKind() != Kind::Record);
Properties.insert_or_assign(Name, &Val);
}
@@ -184,33 +188,23 @@ private:
/// In C++, prvalues of class type serve only a limited purpose: They can only
/// be used to initialize a result object. It is not possible to access member
/// variables or call member functions on a prvalue of class type.
-/// Correspondingly, `RecordValue` also serves only two limited purposes:
-/// - It conveys a prvalue of class type from the place where the object is
-/// constructed to the result object that it initializes.
+/// Correspondingly, `RecordValue` also serves only a limited purpose: It
+/// conveys a prvalue of class type from the place where the object is
+/// constructed to the result object that it initializes.
///
-/// When creating a prvalue of class type, we already need a storage location
-/// for `this`, even though prvalues are otherwise not associated with storage
-/// locations. `RecordValue` is therefore essentially a wrapper for a storage
-/// location, which is then used to set the storage location for the result
-/// object when we process the AST node for that result object.
+/// When creating a prvalue of class type, we already need a storage location
+/// for `this`, even though prvalues are otherwise not associated with storage
+/// locations. `RecordValue` is therefore essentially a wrapper for a storage
+/// location, which is then used to set the storage location for the result
+/// object when we process the AST node for that result object.
///
-/// For example:
-/// MyStruct S = MyStruct(3);
+/// For example:
+/// MyStruct S = MyStruct(3);
///
-/// In this example, `MyStruct(3) is a prvalue, which is modeled as a
-/// `RecordValue` that wraps a `RecordStorageLocation`. This
-// `RecordStorageLocation` is then used as the storage location for `S`.
+/// In this example, `MyStruct(3)` is a prvalue, which is modeled as a
+/// `RecordValue` that wraps a `RecordStorageLocation`. This
+/// `RecordStorageLocation` is then used as the storage location for `S`.
///
-/// - It allows properties to be associated with an object of class type.
-/// Note that when doing so, you should avoid mutating the properties of an
-/// existing `RecordValue` in place, as these changes would be visible to
-/// other `Environment`s that share the same `RecordValue`. Instead, associate
-/// a new `RecordValue` with the `RecordStorageLocation` and set the
-/// properties on this new `RecordValue`. (See also `refreshRecordValue()` in
-/// DataflowEnvironment.h, which makes this easy.)
-/// Note also that this implies that it is common for the same
-/// `RecordStorageLocation` to be associated with different `RecordValue`s
-/// in different environments.
/// Over time, we may eliminate `RecordValue` entirely. See also the discussion
/// here: https://reviews.llvm.org/D155204#inline-1503204
class RecordValue final : public Value {
diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
index 82a1ba3c82ad..31ec84143f65 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -68,6 +68,9 @@ TARGET_BUILTIN(__builtin_arm_ldg, "v*v*", "t", "mte")
TARGET_BUILTIN(__builtin_arm_stg, "vv*", "t", "mte")
TARGET_BUILTIN(__builtin_arm_subp, "Uiv*v*", "t", "mte")
+// SME state function
+BUILTIN(__builtin_arm_get_sme_state, "vULi*ULi*", "n")
+
// Memory Operations
TARGET_BUILTIN(__builtin_arm_mops_memset_tag, "v*v*iz", "", "mte,mops")
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 7cf347e92d99..6765721ae700 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -348,7 +348,8 @@ def CXX98CompatPedantic : DiagGroup<"c++98-compat-pedantic",
CXXPre20CompatPedantic,
CXXPre23CompatPedantic]>;
-def CXX11Narrowing : DiagGroup<"c++11-narrowing">;
+def CXX11NarrowingConstReference : DiagGroup<"c++11-narrowing-const-reference">;
+def CXX11Narrowing : DiagGroup<"c++11-narrowing", [CXX11NarrowingConstReference]>;
def CXX11WarnInconsistentOverrideDestructor :
DiagGroup<"inconsistent-missing-destructor-override">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c100041ca400..aebb7d9b945c 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6158,12 +6158,24 @@ def err_illegal_initializer_type : Error<"illegal initializer type %0">;
def ext_init_list_type_narrowing : ExtWarn<
"type %0 cannot be narrowed to %1 in initializer list">,
InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure;
+// *_narrowing_const_reference diagnostics have the same messages, but are
+// controlled by -Wc++11-narrowing-const-reference for narrowing involving a
+// const reference.
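+// An illustrative case (an assumption, not taken from the tests): with
+//   const int &r{0.5};
+// the narrowing happens while binding a const reference, so the diagnostic is
+// expected to fall into the const-reference group (still a subgroup of
+// -Wc++11-narrowing).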
+def ext_init_list_type_narrowing_const_reference : ExtWarn<
+ "type %0 cannot be narrowed to %1 in initializer list">,
+ InGroup<CXX11NarrowingConstReference>, DefaultError, SFINAEFailure;
def ext_init_list_variable_narrowing : ExtWarn<
"non-constant-expression cannot be narrowed from type %0 to %1 in "
"initializer list">, InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure;
+def ext_init_list_variable_narrowing_const_reference : ExtWarn<
+ "non-constant-expression cannot be narrowed from type %0 to %1 in "
+ "initializer list">, InGroup<CXX11NarrowingConstReference>, DefaultError, SFINAEFailure;
def ext_init_list_constant_narrowing : ExtWarn<
"constant expression evaluates to %0 which cannot be narrowed to type %1">,
InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure;
+def ext_init_list_constant_narrowing_const_reference : ExtWarn<
+ "constant expression evaluates to %0 which cannot be narrowed to type %1">,
+ InGroup<CXX11NarrowingConstReference>, DefaultError, SFINAEFailure;
def warn_init_list_type_narrowing : Warning<
"type %0 cannot be narrowed to %1 in initializer list in C++11">,
InGroup<CXX11Narrowing>, DefaultIgnore;
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 152d9f65f86d..21abc346cf17 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -456,6 +456,7 @@ ENUM_LANGOPT(SignReturnAddressScope, SignReturnAddressScopeKind, 2, SignReturnAd
ENUM_LANGOPT(SignReturnAddressKey, SignReturnAddressKeyKind, 1, SignReturnAddressKeyKind::AKey,
"Key used for return address signing")
LANGOPT(BranchTargetEnforcement, 1, 0, "Branch-target enforcement enabled")
+LANGOPT(BranchProtectionPAuthLR, 1, 0, "Use PC as a diversifier using PAuthLR NOP instructions.")
LANGOPT(SpeculativeLoadHardening, 1, 0, "Speculative load hardening enabled")
diff --git a/clang/include/clang/Basic/PlistSupport.h b/clang/include/clang/Basic/PlistSupport.h
index 557462a5b90d..d52d196019cf 100644
--- a/clang/include/clang/Basic/PlistSupport.h
+++ b/clang/include/clang/Basic/PlistSupport.h
@@ -77,8 +77,7 @@ inline raw_ostream &EmitInteger(raw_ostream &o, int64_t value) {
inline raw_ostream &EmitString(raw_ostream &o, StringRef s) {
o << "<string>";
- for (StringRef::const_iterator I = s.begin(), E = s.end(); I != E; ++I) {
- char c = *I;
+ for (char c : s) {
switch (c) {
default:
o << c;
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index aa0f5023104a..ac3c324c6c29 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1372,6 +1372,7 @@ public:
LangOptions::SignReturnAddressKeyKind SignKey =
LangOptions::SignReturnAddressKeyKind::AKey;
bool BranchTargetEnforcement = false;
+ bool BranchProtectionPAuthLR = false;
};
/// Determine if the Architecture in this TargetInfo supports branch
diff --git a/clang/include/clang/Basic/Version.h b/clang/include/clang/Basic/Version.h
index 2881d8db954e..8e4e6928fded 100644
--- a/clang/include/clang/Basic/Version.h
+++ b/clang/include/clang/Basic/Version.h
@@ -40,6 +40,9 @@ namespace clang {
/// string as getClangRevision.
std::string getLLVMRevision();
+ /// Retrieves the Clang vendor tag.
+ std::string getClangVendor();
+
/// Retrieves the full repository version that is an amalgamation of
/// the information in getClangRepositoryPath() and getClangRevision().
std::string getClangFullRepositoryVersion();
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index ce99ca82c1d3..aac3bd486de9 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -351,6 +351,285 @@ let TargetGuard = "sme2" in {
def SVBMOPA : Inst<"svbmopa_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>;
+
+ // VERTICAL DOT-PRODUCT
+ def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVVDOT_LANE_ZA32_VG1x4_S : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVVDOT_LANE_ZA32_VG1x2_U : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVVDOT_LANE_ZA32_VG1x4_U : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVVDOT_LANE_ZA32_VG1x2_F : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "hb", MergeNone, "aarch64_sme_fvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVSUVDOT_LANE_ZA32_VG1x4 : Inst<"svsuvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_suvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVUSVDOT_LANE_ZA32_VG1x4 : Inst<"svusvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_usvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+
+ // Multi-vector signed & unsigned integer dot-product
+ def SVDOT_MULTI_ZA32_VG1x2_S : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_MULTI_ZA32_VG1x4_S : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_MULTI_ZA32_VG1x2_U : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_MULTI_ZA32_VG1x4_U : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA32_VG1x2_S : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA32_VG1x4_S : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA32_VG1x2_U : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA32_VG1x4_U : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_LANE_ZA32_VG1x2_S : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVDOT_LANE_ZA32_VG1x4_S : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVDOT_LANE_ZA32_VG1x2_U : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVDOT_LANE_ZA32_VG1x4_U : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+
+ def SVUSDOT_SINGLE_ZA32_VG1x2 : Inst<"svusdot[_single]_za32[_{d}]_vg1x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVUSDOT_SINGLE_ZA32_VG1x4 : Inst<"svusdot[_single]_za32[_{d}]_vg1x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVUSDOT_MULTI_ZA32_VG1x2 : Inst<"svusdot_za32[_{d}]_vg1x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVUSDOT_MULTI_ZA32_VG1x4 : Inst<"svusdot_za32[_{d}]_vg1x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVUSDOT_LANE_ZA32_VG1x2 : Inst<"svusdot_lane_za32[_{d}]_vg1x2", "vm2.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVUSDOT_LANE_ZA32_VG1x4 : Inst<"svusdot_lane_za32[_{d}]_vg1x4", "vm4.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+
+ def SVSUDOT_SINGLE_ZA32_VG1x2 : Inst<"svsudot[_single]_za32[_{d}]_vg1x2", "vm2.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVSUDOT_SINGLE_ZA32_VG1x4 : Inst<"svsudot[_single]_za32[_{d}]_vg1x4", "vm4.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ // Multi-multi sudot builtins are mapped to usdot, with zn & zm operands swapped
+ def SVSUDOT_MULTI_ZA32_VG1x2 : Inst<"svsudot_za32[_{d}]_vg1x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVSUDOT_MULTI_ZA32_VG1x4 : Inst<"svsudot_za32[_{d}]_vg1x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVSUDOT_LANE_ZA32_VG1x2 : Inst<"svsudot_lane_za32[_{d}]_vg1x2", "vm2.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVSUDOT_LANE_ZA32_VG1x4 : Inst<"svsudot_lane_za32[_{d}]_vg1x4", "vm4.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+
+ // Multi-vector half-precision/BFloat16 floating-point dot-product
+ def SVDOT_MULTI_ZA32_VG1x2_F16 : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "bh", MergeNone, "aarch64_sme_fdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_MULTI_ZA32_VG1x4_F16 : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "bh", MergeNone, "aarch64_sme_fdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA32_VG1x2_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA32_VG1x4_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_LANE_ZA32_VG1x2_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVDOT_LANE_ZA32_VG1x4_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+}
+
+let TargetGuard = "sme2,sme-i16i64" in {
+ def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+
+ def SVDOT_MULTI_ZA64_VG1x2_S16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_MULTI_ZA64_VG1x4_S16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_MULTI_ZA64_VG1x2_U16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_MULTI_ZA64_VG1x4_U16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA64_VG1x2_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA64_VG1x4_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA64_VG1x2_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_SINGLE_ZA64_VG1x4_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVDOT_LANE_ZA64_VG1x2_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_sdot_lane_za64_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVDOT_LANE_ZA64_VG1x4_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_sdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVDOT_LANE_ZA64_VG1x2_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVDOT_LANE_ZA64_VG1x4_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+}
+
+// FMLA/FMLS
+let TargetGuard = "sme2" in {
+ def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_SINGLE_VG1x2_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x4_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+ def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>;
+}
+
+let TargetGuard = "sme2,sme-f64f64" in {
+ def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+ def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>;
+}
+
+// FMLAL/FMLSL/UMLAL/SMLAL
+// SMLALL/UMLALL/USMLALL/SUMLALL
+let TargetGuard = "sme2" in {
+ // MULTI MLAL
+ def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlal_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlal_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlal_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlal_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLAL_MULTI_VG4x2_S8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x2_U8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // MULTI MLSL
+ def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLSL_MULTI_VG4x2_S8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smls_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x2_U8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // SINGLE MLAL
+ def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLAL_SINGLE_VG4x1_S8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x1_U8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // SINGLE MLSL
+ def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
+
+ def SVMLSL_SINGLE_VG4x1_S8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x1_U8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // INDEXED MLAL
+ def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x4_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x1_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x2_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x4_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x1_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x2_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG2x4_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+ def SVMLAL_LANE_VG4x1_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x1_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x2_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x2_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // INDEXED MLSL
+ def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x4_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x1_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x2_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x4_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x1_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x2_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG2x4_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+ def SVMLSL_LANE_VG4x1_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x1_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x2_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x2_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // SINGLE SUMLALL
+ // Single sumla maps to usmla, with zn & zm operands swapped
+ def SVSUMLALL_SINGLE_VG4x1 : Inst<"svsumla_za32[_{d}]_vg4x1", "vmdu", "c", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+
+ def SVSUMLALL_SINGLE_VG4x2 : Inst<"svsumla[_single]_za32[_{d}]_vg4x2", "vm2.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVSUMLALL_SINGLE_VG4x4 : Inst<"svsumla[_single]_za32[_{d}]_vg4x4", "vm4.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // Multi-multi sumla builtins are mapped to usmla, with zn & zm operands swapped
+ def SVSUMLALL_MULTI_VG4x2 : Inst<"svsumla_za32[_{d}]_vg4x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVSUMLALL_MULTI_VG4x4 : Inst<"svsumla_za32[_{d}]_vg4x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // INDEXED SUMLALL
+ def SVSUMLALL_LANE_VG4x1 : Inst<"svsumla_lane_za32[_{d}]_vg4x1", "vmdui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVSUMLALL_LANE_VG4x2 : Inst<"svsumla_lane_za32[_{d}]_vg4x2", "vm2ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVSUMLALL_LANE_VG4x4 : Inst<"svsumla_lane_za32[_{d}]_vg4x4", "vm4ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // SINGLE USMLALL
+ def SVUSMLALL_SINGLE_VG4x1 : Inst<"svusmla_za32[_{d}]_vg4x1", "vmdx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVUSMLALL_SINGLE_VG4x2 : Inst<"svusmla[_single]_za32[_{d}]_vg4x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVUSMLALL_SINGLE_VG4x4 : Inst<"svusmla[_single]_za32[_{d}]_vg4x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // MULTI USMLALL
+ def SVUSMLALL_MULTI_VG4x2 : Inst<"svusmla_za32[_{d}]_vg4x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVUSMLALL_MULTI_VG4x4 : Inst<"svusmla_za32[_{d}]_vg4x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // INDEXED USMLALL
+ def SVUSMLALL_LANE_VG4x1 : Inst<"svusmla_lane_za32[_{d}]_vg4x1", "vmdxi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVUSMLALL_LANE_VG4x2 : Inst<"svusmla_lane_za32[_{d}]_vg4x2", "vm2xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+ def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vm4xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+}
+
+let TargetGuard = "sme2,sme-i16i64" in {
+ // MULTI MLAL
+ def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // MULTI MLSL
+ def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // SINGLE MLAL
+ def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // SINGLE MLSL
+ def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+ def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+
+ // INDEXED MLAL
+ def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+ // INDEXED MLSL
+ def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x2_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
}
//
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 04bf7acdeba7..91f62c4c7633 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1988,79 +1988,61 @@ def SVWHILELO_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNo
def SVWHILELS_COUNT : SInst<"svwhilele_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>;
def SVWHILEHI_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>;
def SVWHILEHS_COUNT : SInst<"svwhilege_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>;
+}
+
+multiclass MultiVecLoad<string i> {
+ // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming)
+ def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+
+ def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+}
-def SVLD1B_X2 : MInst<"svld1[_{2}]_x2", "2}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1H_X2 : MInst<"svld1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1W_X2 : MInst<"svld1[_{2}]_x2", "2}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1D_X2 : MInst<"svld1[_{2}]_x2", "2}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1B_X4 : MInst<"svld1[_{2}]_x4", "4}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-def SVLD1H_X4 : MInst<"svld1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-def SVLD1W_X4 : MInst<"svld1[_{2}]_x4", "4}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-def SVLD1D_X4 : MInst<"svld1[_{2}]_x4", "4}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-
-def SVLDNT1B_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1H_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1W_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1D_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1B_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-def SVLDNT1H_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-def SVLDNT1W_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-def SVLDNT1D_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-
-def SVLD1B_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1H_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1W_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1D_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">;
-def SVLD1B_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-def SVLD1H_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-def SVLD1W_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-def SVLD1D_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">;
-
-def SVLDNT1B_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1H_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1W_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1D_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">;
-def SVLDNT1B_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-def SVLDNT1H_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-def SVLDNT1W_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-def SVLDNT1D_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">;
-
-def SVST1B_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1H_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1W_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1D_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1B_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-def SVST1H_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-def SVST1W_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-def SVST1D_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-
-def SVST1B_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1H_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1W_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1D_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">;
-def SVST1B_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-def SVST1H_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-def SVST1W_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-def SVST1D_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">;
-
-def SVSTNT1B_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1H_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1W_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1D_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1B_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
-def SVSTNT1H_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
-def SVSTNT1W_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
-def SVSTNT1D_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
-
-def SVSTNT1B_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1H_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1W_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1D_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">;
-def SVSTNT1B_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
-def SVSTNT1H_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
-def SVSTNT1W_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
-def SVSTNT1D_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">;
+let TargetGuard = "sve2p1|sme2" in {
+ defm LD1 : MultiVecLoad<"ld1">;
+ defm LDNT1 : MultiVecLoad<"ldnt1">;
+}
+
+multiclass MultiVecStore<string i> {
+ // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming)
+ def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+
+ def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">;
+ def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+ def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">;
+}
+
+let TargetGuard = "sve2p1|sme2" in {
+ defm ST1 : MultiVecStore<"st1">;
+ defm STNT1 : MultiVecStore<"stnt1">;
+}
+let TargetGuard = "sve2p1" in {
def SVDOT_X2_S : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "i", MergeNone, "aarch64_sve_sdot_x2", [], []>;
def SVDOT_X2_U : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "Ui", MergeNone, "aarch64_sve_udot_x2", [], []>;
def SVDOT_X2_F : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "f", MergeNone, "aarch64_sve_fdot_x2", [], []>;
@@ -2142,6 +2124,21 @@ let TargetGuard = "sme2" in {
}
let TargetGuard = "sme2" in {
+ // FRINTA / FRINTM / FRINTN / FRINTP
+ def SVRINTA_X2 : SInst<"svrinta[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frinta_x2", [IsStreaming], []>;
+ def SVRINTA_X4 : SInst<"svrinta[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frinta_x4", [IsStreaming], []>;
+
+ def SVRINTM_X2 : SInst<"svrintm[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintm_x2", [IsStreaming], []>;
+ def SVRINTM_X4 : SInst<"svrintm[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintm_x4", [IsStreaming], []>;
+
+ def SVRINTN_X2 : SInst<"svrintn[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintn_x2", [IsStreaming], []>;
+ def SVRINTN_X4 : SInst<"svrintn[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintn_x4", [IsStreaming], []>;
+
+ def SVRINTP_X2 : SInst<"svrintp[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintp_x2", [IsStreaming], []>;
+ def SVRINTP_X4 : SInst<"svrintp[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintp_x4", [IsStreaming], []>;
+}
+
+let TargetGuard = "sme2" in {
def SVSCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "csil", MergeNone, "aarch64_sve_sclamp_single_x2", [IsStreaming], []>;
def SVUCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp_single_x2", [IsStreaming], []>;
def SVFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x2", [IsStreaming], []>;
diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td
index bb54e2664186..0d471f6c554c 100644
--- a/clang/include/clang/Basic/riscv_sifive_vector.td
+++ b/clang/include/clang/Basic/riscv_sifive_vector.td
@@ -109,10 +109,10 @@ multiclass RVVVFWMACCBuiltinSet<list<list<string>> suffixes_prototypes> {
Name = NAME,
HasMasked = false,
Log2LMUL = [-2, -1, 0, 1, 2] in
- defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "b", suffixes_prototypes>;
+ defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "y", suffixes_prototypes>;
}
-multiclass RVVVQMACCBuiltinSet<list<list<string>> suffixes_prototypes> {
+multiclass RVVVQMACCDODBuiltinSet<list<list<string>> suffixes_prototypes> {
let OverloadedName = NAME,
Name = NAME,
HasMasked = false,
@@ -120,6 +120,14 @@ multiclass RVVVQMACCBuiltinSet<list<list<string>> suffixes_prototypes> {
defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "i", suffixes_prototypes>;
}
+multiclass RVVVQMACCQOQBuiltinSet<list<list<string>> suffixes_prototypes> {
+ let OverloadedName = NAME,
+ Name = NAME,
+ HasMasked = false,
+ Log2LMUL = [-1, 0, 1, 2] in
+ defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "s", suffixes_prototypes>;
+}
+
multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_range> {
let Log2LMUL = [-3, -2, -1, 0, 1, 2],
Name = NAME,
@@ -130,18 +138,18 @@ multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_ra
let UnMaskedPolicyScheme = HasPolicyOperand in
let RequiredFeatures = ["Xsfvqmaccdod"] in {
- defm sf_vqmaccu_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>;
- defm sf_vqmacc_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)v"]]>;
- defm sf_vqmaccus_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)v"]]>;
- defm sf_vqmaccsu_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>;
+ defm sf_vqmaccu_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>;
+ defm sf_vqmacc_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)v"]]>;
+ defm sf_vqmaccus_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)v"]]>;
+ defm sf_vqmaccsu_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>;
}
let UnMaskedPolicyScheme = HasPolicyOperand in
let RequiredFeatures = ["Xsfvqmaccqoq"] in {
- defm sf_vqmaccu_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>;
- defm sf_vqmacc_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)v"]]>;
- defm sf_vqmaccus_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)v"]]>;
- defm sf_vqmaccsu_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>;
+ defm sf_vqmaccu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>;
+ defm sf_vqmacc_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)v"]]>;
+ defm sf_vqmaccus_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)SUv(FixedSEW:8)v"]]>;
+ defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>;
}
let UnMaskedPolicyScheme = HasPolicyOperand in
diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td
index f2dde7f540fb..e7d78b03511f 100644
--- a/clang/include/clang/Basic/riscv_vector.td
+++ b/clang/include/clang/Basic/riscv_vector.td
@@ -2441,11 +2441,9 @@ let HasMasked = false, HasVL = false, IRName = "" in {
return Builder.CreateInsertVector(ResultType, Ops[0], Ops[2], Ops[1]);
}
}] in {
- let Log2LMUL = [0, 1, 2] in {
- foreach dst_lmul = ["(LFixedLog2LMUL:1)", "(LFixedLog2LMUL:2)", "(LFixedLog2LMUL:3)"] in {
- def : RVVBuiltin<"v" # dst_lmul # "v", dst_lmul # "v" # dst_lmul # "vKzv", "csilxfd">;
- def : RVVBuiltin<"Uv" # dst_lmul # "Uv", dst_lmul # "Uv" # dst_lmul #"UvKzUv", "csil">;
- }
+ foreach dst_lmul = ["(LFixedLog2LMUL:1)", "(LFixedLog2LMUL:2)", "(LFixedLog2LMUL:3)"] in {
+ def : RVVBuiltin<"v" # dst_lmul # "v", dst_lmul # "v" # dst_lmul # "vKzv", "csilxfd">;
+ def : RVVBuiltin<"Uv" # dst_lmul # "Uv", dst_lmul # "Uv" # dst_lmul #"UvKzUv", "csil">;
}
foreach nf = NFList in {
defvar T = "(Tuple:" # nf # ")";
diff --git a/clang/include/clang/Basic/riscv_vector_common.td b/clang/include/clang/Basic/riscv_vector_common.td
index 4036ce8e6903..040db6f0cdbf 100644
--- a/clang/include/clang/Basic/riscv_vector_common.td
+++ b/clang/include/clang/Basic/riscv_vector_common.td
@@ -41,7 +41,7 @@
// x: float16_t (half)
// f: float32_t (float)
// d: float64_t (double)
-// b: bfloat16_t (bfloat16)
+// y: bfloat16_t (bfloat16)
//
// This way, given an LMUL, a record with a TypeRange "sil" will cause the
// definition of 3 builtins. Each type "t" in the TypeRange (in this example
diff --git a/clang/include/clang/Config/config.h.cmake b/clang/include/clang/Config/config.h.cmake
index a54a26cd32ff..4015ac804086 100644
--- a/clang/include/clang/Config/config.h.cmake
+++ b/clang/include/clang/Config/config.h.cmake
@@ -57,6 +57,12 @@
/* Define if we have sys/resource.h (rlimits) */
#cmakedefine CLANG_HAVE_RLIMITS ${CLANG_HAVE_RLIMITS}
+/* Define if we have dlfcn.h */
+#cmakedefine CLANG_HAVE_DLFCN_H ${CLANG_HAVE_DLFCN_H}
+
+/* Define if dladdr() is available on this platform. */
+#cmakedefine CLANG_HAVE_DLADDR ${CLANG_HAVE_DLADDR}
+
/* Linker version detected at compile time. */
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1b02087425b7..2b93ddf03349 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5308,7 +5308,8 @@ def rewrite_objc : Flag<["-"], "rewrite-objc">, Flags<[NoXarchOption]>,
def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">,
Flags<[NoXarchOption]>,
HelpText<"Rewrite Legacy Objective-C source to C++">;
-def rdynamic : Flag<["-"], "rdynamic">, Group<Link_Group>;
+def rdynamic : Flag<["-"], "rdynamic">, Group<Link_Group>,
+ Visibility<[ClangOption, FlangOption]>;
def resource_dir : Separate<["-"], "resource-dir">,
Flags<[NoXarchOption, HelpHidden]>,
Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>,
@@ -6999,6 +7000,8 @@ def msign_return_address_key_EQ : Joined<["-"], "msign-return-address-key=">,
Values<"a_key,b_key">;
def mbranch_target_enforce : Flag<["-"], "mbranch-target-enforce">,
MarshallingInfoFlag<LangOpts<"BranchTargetEnforcement">>;
+def mbranch_protection_pauth_lr : Flag<["-"], "mbranch-protection-pauth-lr">,
+ MarshallingInfoFlag<LangOpts<"BranchProtectionPAuthLR">>;
def fno_dllexport_inlines : Flag<["-"], "fno-dllexport-inlines">,
MarshallingInfoNegativeFlag<LangOpts<"DllExportInlines">>;
def cfguard_no_checks : Flag<["-"], "cfguard-no-checks">,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 59358e77edb0..21d791f5cd89 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -2422,6 +2422,8 @@ public:
CurrentBitsIndex = 0;
}
+ void advance(uint32_t BitsWidth) { CurrentBitsIndex += BitsWidth; }
+
bool getNextBit() {
assert(isValid());
return Value & (1 << CurrentBitsIndex++);
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index a56929ef0245..de69f99003d8 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -564,11 +564,25 @@ private:
unsigned DeclEnumAbbrev = 0;
unsigned DeclObjCIvarAbbrev = 0;
unsigned DeclCXXMethodAbbrev = 0;
+ unsigned DeclDependentNonTemplateCXXMethodAbbrev = 0;
+ unsigned DeclTemplateCXXMethodAbbrev = 0;
+ unsigned DeclMemberSpecializedCXXMethodAbbrev = 0;
+ unsigned DeclTemplateSpecializedCXXMethodAbbrev = 0;
+ unsigned DeclDependentSpecializationCXXMethodAbbrev = 0;
+ unsigned DeclTemplateTypeParmAbbrev = 0;
+ unsigned DeclUsingShadowAbbrev = 0;
unsigned DeclRefExprAbbrev = 0;
unsigned CharacterLiteralAbbrev = 0;
unsigned IntegerLiteralAbbrev = 0;
unsigned ExprImplicitCastAbbrev = 0;
+ unsigned BinaryOperatorAbbrev = 0;
+ unsigned CompoundAssignOperatorAbbrev = 0;
+ unsigned CallExprAbbrev = 0;
+ unsigned CXXOperatorCallExprAbbrev = 0;
+ unsigned CXXMemberCallExprAbbrev = 0;
+
+ unsigned CompoundStmtAbbrev = 0;
void WriteDeclAbbrevs();
void WriteDecl(ASTContext &Context, Decl *D);
@@ -735,12 +749,41 @@ public:
unsigned getDeclFieldAbbrev() const { return DeclFieldAbbrev; }
unsigned getDeclEnumAbbrev() const { return DeclEnumAbbrev; }
unsigned getDeclObjCIvarAbbrev() const { return DeclObjCIvarAbbrev; }
- unsigned getDeclCXXMethodAbbrev() const { return DeclCXXMethodAbbrev; }
+ unsigned getDeclCXXMethodAbbrev(FunctionDecl::TemplatedKind Kind) const {
+ switch (Kind) {
+ case FunctionDecl::TK_NonTemplate:
+ return DeclCXXMethodAbbrev;
+ case FunctionDecl::TK_FunctionTemplate:
+ return DeclTemplateCXXMethodAbbrev;
+ case FunctionDecl::TK_MemberSpecialization:
+ return DeclMemberSpecializedCXXMethodAbbrev;
+ case FunctionDecl::TK_FunctionTemplateSpecialization:
+ return DeclTemplateSpecializedCXXMethodAbbrev;
+ case FunctionDecl::TK_DependentNonTemplate:
+ return DeclDependentNonTemplateCXXMethodAbbrev;
+ case FunctionDecl::TK_DependentFunctionTemplateSpecialization:
+ return DeclDependentSpecializationCXXMethodAbbrev;
+ }
+ llvm_unreachable("Unknown Template Kind!");
+ }
+ unsigned getDeclTemplateTypeParmAbbrev() const {
+ return DeclTemplateTypeParmAbbrev;
+ }
+ unsigned getDeclUsingShadowAbbrev() const { return DeclUsingShadowAbbrev; }
unsigned getDeclRefExprAbbrev() const { return DeclRefExprAbbrev; }
unsigned getCharacterLiteralAbbrev() const { return CharacterLiteralAbbrev; }
unsigned getIntegerLiteralAbbrev() const { return IntegerLiteralAbbrev; }
unsigned getExprImplicitCastAbbrev() const { return ExprImplicitCastAbbrev; }
+ unsigned getBinaryOperatorAbbrev() const { return BinaryOperatorAbbrev; }
+ unsigned getCompoundAssignOperatorAbbrev() const {
+ return CompoundAssignOperatorAbbrev;
+ }
+ unsigned getCallExprAbbrev() const { return CallExprAbbrev; }
+ unsigned getCXXOperatorCallExprAbbrev() { return CXXOperatorCallExprAbbrev; }
+ unsigned getCXXMemberCallExprAbbrev() { return CXXMemberCallExprAbbrev; }
+
+ unsigned getCompoundStmtAbbrev() const { return CompoundStmtAbbrev; }
bool hasChain() const { return Chain; }
ASTReader *getChain() const { return Chain; }
@@ -841,46 +884,33 @@ public:
BitsPacker(BitsPacker &&) = delete;
BitsPacker operator=(const BitsPacker &) = delete;
BitsPacker operator=(BitsPacker &&) = delete;
- ~BitsPacker() {
- assert(!hasUnconsumedValues() && "There are unprocessed bits!");
+ ~BitsPacker() = default;
+
+ bool canWriteNextNBits(uint32_t BitsWidth) const {
+ return CurrentBitIndex + BitsWidth < BitIndexUpbound;
+ }
+
+ void reset(uint32_t Value) {
+ UnderlyingValue = Value;
+ CurrentBitIndex = 0;
}
void addBit(bool Value) { addBits(Value, 1); }
void addBits(uint32_t Value, uint32_t BitsWidth) {
assert(BitsWidth < BitIndexUpbound);
assert((Value < (1u << BitsWidth)) && "Passing narrower bit width!");
+ assert(canWriteNextNBits(BitsWidth) &&
+ "Inserting too much bits into a value!");
- if (CurrentBitIndex + BitsWidth >= BitIndexUpbound) {
- Values.push_back(0);
- CurrentBitIndex = 0;
- }
-
- assert(CurrentBitIndex < BitIndexUpbound);
- Values.back() |= Value << CurrentBitIndex;
+ UnderlyingValue |= Value << CurrentBitIndex;
CurrentBitIndex += BitsWidth;
}
- bool hasUnconsumedValues() const {
- return ConsumingValueIndex < Values.size();
- }
- uint32_t getNextValue() {
- assert(hasUnconsumedValues());
- return Values[ConsumingValueIndex++];
- }
-
- // We can convert the packer to an uint32_t if there is only one values.
- operator uint32_t() {
- assert(Values.size() == 1);
- return getNextValue();
- }
+ operator uint32_t() { return UnderlyingValue; }
private:
- SmallVector<uint64_t, 4> Values;
- uint16_t ConsumingValueIndex = 0;
- // Initialize CurrentBitIndex with an invalid value
- // to make it easier to update Values. See the implementation
- // of `addBits` to see the details.
- uint16_t CurrentBitIndex = BitIndexUpbound;
+ uint32_t UnderlyingValue = 0;
+ uint32_t CurrentBitIndex = 0;
};
} // namespace clang
diff --git a/clang/include/clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h b/clang/include/clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h
index 5d2c96e5bc9d..45187433c069 100644
--- a/clang/include/clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h
+++ b/clang/include/clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h
@@ -13,6 +13,7 @@
namespace clang {
namespace ento {
namespace categories {
+extern const char *const AppleAPIMisuse;
extern const char *const CoreFoundationObjectiveC;
extern const char *const LogicError;
extern const char *const MemoryRefCount;
diff --git a/clang/include/clang/StaticAnalyzer/Core/Checker.h b/clang/include/clang/StaticAnalyzer/Core/Checker.h
index 8a46282a595e..2ec54a837c42 100644
--- a/clang/include/clang/StaticAnalyzer/Core/Checker.h
+++ b/clang/include/clang/StaticAnalyzer/Core/Checker.h
@@ -193,9 +193,8 @@ public:
class Location {
template <typename CHECKER>
- static void _checkLocation(void *checker,
- const SVal &location, bool isLoad, const Stmt *S,
- CheckerContext &C) {
+ static void _checkLocation(void *checker, SVal location, bool isLoad,
+ const Stmt *S, CheckerContext &C) {
((const CHECKER *)checker)->checkLocation(location, isLoad, S, C);
}
@@ -209,8 +208,7 @@ public:
class Bind {
template <typename CHECKER>
- static void _checkBind(void *checker,
- const SVal &location, const SVal &val, const Stmt *S,
+ static void _checkBind(void *checker, SVal location, SVal val, const Stmt *S,
CheckerContext &C) {
((const CHECKER *)checker)->checkBind(location, val, S, C);
}
@@ -456,10 +454,8 @@ namespace eval {
class Assume {
template <typename CHECKER>
- static ProgramStateRef _evalAssume(void *checker,
- ProgramStateRef state,
- const SVal &cond,
- bool assumption) {
+ static ProgramStateRef _evalAssume(void *checker, ProgramStateRef state,
+ SVal cond, bool assumption) {
return ((const CHECKER *)checker)->evalAssume(state, cond, assumption);
}
diff --git a/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h b/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h
index 39583c443eda..a45ba1bc573e 100644
--- a/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h
@@ -488,13 +488,11 @@ public:
using CheckCallFunc =
CheckerFn<void (const CallEvent &, CheckerContext &)>;
- using CheckLocationFunc =
- CheckerFn<void (const SVal &location, bool isLoad, const Stmt *S,
- CheckerContext &)>;
+ using CheckLocationFunc = CheckerFn<void(SVal location, bool isLoad,
+ const Stmt *S, CheckerContext &)>;
using CheckBindFunc =
- CheckerFn<void (const SVal &location, const SVal &val, const Stmt *S,
- CheckerContext &)>;
+ CheckerFn<void(SVal location, SVal val, const Stmt *S, CheckerContext &)>;
using CheckEndAnalysisFunc =
CheckerFn<void (ExplodedGraph &, BugReporter &, ExprEngine &)>;
@@ -530,8 +528,7 @@ public:
RegionAndSymbolInvalidationTraits *ITraits)>;
using EvalAssumeFunc =
- CheckerFn<ProgramStateRef (ProgramStateRef, const SVal &cond,
- bool assumption)>;
+ CheckerFn<ProgramStateRef(ProgramStateRef, SVal cond, bool assumption)>;
using EvalCallFunc = CheckerFn<bool (const CallEvent &, CheckerContext &)>;
diff --git a/clang/lib/APINotes/APINotesManager.cpp b/clang/lib/APINotes/APINotesManager.cpp
index a921c8b9fce3..d3aef09dac91 100644
--- a/clang/lib/APINotes/APINotesManager.cpp
+++ b/clang/lib/APINotes/APINotesManager.cpp
@@ -125,7 +125,7 @@ APINotesManager::loadAPINotes(StringRef Buffer) {
bool APINotesManager::loadAPINotes(const DirectoryEntry *HeaderDir,
FileEntryRef APINotesFile) {
- assert(Readers.find(HeaderDir) == Readers.end());
+ assert(!Readers.contains(HeaderDir));
if (auto Reader = loadAPINotes(APINotesFile)) {
Readers[HeaderDir] = Reader.release();
return false;
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 49d0dd218d68..b61180c4f349 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -2771,9 +2771,11 @@ ASTNodeImporter::VisitTypeAliasTemplateDecl(TypeAliasTemplateDecl *D) {
for (auto *FoundDecl : FoundDecls) {
if (!FoundDecl->isInIdentifierNamespace(IDNS))
continue;
- if (auto *FoundAlias = dyn_cast<TypeAliasTemplateDecl>(FoundDecl))
- return Importer.MapImported(D, FoundAlias);
- ConflictingDecls.push_back(FoundDecl);
+ if (auto *FoundAlias = dyn_cast<TypeAliasTemplateDecl>(FoundDecl)) {
+ if (IsStructuralMatch(D, FoundAlias))
+ return Importer.MapImported(D, FoundAlias);
+ ConflictingDecls.push_back(FoundDecl);
+ }
}
if (!ConflictingDecls.empty()) {
@@ -3418,10 +3420,16 @@ static bool isAncestorDeclContextOf(const DeclContext *DC, const Stmt *S) {
while (!ToProcess.empty()) {
const Stmt *CurrentS = ToProcess.pop_back_val();
ToProcess.append(CurrentS->child_begin(), CurrentS->child_end());
- if (const auto *DeclRef = dyn_cast<DeclRefExpr>(CurrentS))
+ if (const auto *DeclRef = dyn_cast<DeclRefExpr>(CurrentS)) {
if (const Decl *D = DeclRef->getDecl())
if (isAncestorDeclContextOf(DC, D))
return true;
+ } else if (const auto *E =
+ dyn_cast_or_null<SubstNonTypeTemplateParmExpr>(CurrentS)) {
+ if (const Decl *D = E->getAssociatedDecl())
+ if (isAncestorDeclContextOf(DC, D))
+ return true;
+ }
}
return false;
}
@@ -7820,6 +7828,18 @@ ExpectedStmt ASTNodeImporter::VisitExplicitCastExpr(ExplicitCastExpr *E) {
*ToLParenLocOrErr, OCE->getBridgeKind(), E->getCastKind(),
*ToBridgeKeywordLocOrErr, ToTypeInfoAsWritten, ToSubExpr);
}
+ case Stmt::BuiltinBitCastExprClass: {
+ auto *BBC = cast<BuiltinBitCastExpr>(E);
+ ExpectedSLoc ToKWLocOrErr = import(BBC->getBeginLoc());
+ if (!ToKWLocOrErr)
+ return ToKWLocOrErr.takeError();
+ ExpectedSLoc ToRParenLocOrErr = import(BBC->getEndLoc());
+ if (!ToRParenLocOrErr)
+ return ToRParenLocOrErr.takeError();
+ return new (Importer.getToContext()) BuiltinBitCastExpr(
+ ToType, E->getValueKind(), E->getCastKind(), ToSubExpr,
+ ToTypeInfoAsWritten, *ToKWLocOrErr, *ToRParenLocOrErr);
+ }
default:
llvm_unreachable("Cast expression of unsupported type!");
return make_error<ASTImportError>(ASTImportError::UnsupportedConstruct);
@@ -9097,6 +9117,12 @@ Expected<Attr *> ASTImporter::Import(const Attr *FromAttr) {
break;
}
+ case attr::AlignValue: {
+ auto *From = cast<AlignValueAttr>(FromAttr);
+ AI.importAttr(From, AI.importArg(From->getAlignment()).value());
+ break;
+ }
+
case attr::Format: {
const auto *From = cast<FormatAttr>(FromAttr);
AI.importAttr(From, Import(From->getType()), From->getFormatIdx(),
@@ -9378,7 +9404,6 @@ Expected<Decl *> ASTImporter::Import(Decl *FromD) {
setImportDeclError(FromD, *Error);
return make_error<ASTImportError>(*Error);
}
-
// Make sure that ImportImpl registered the imported decl.
assert(ImportedDecls.count(FromD) != 0 && "Missing call to MapImported?");
if (auto Error = ImportAttrs(ToD, FromD))
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index 6bb4bf14b873..1f492b051e03 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -1978,6 +1978,18 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
}
static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
+ TypeAliasTemplateDecl *D1,
+ TypeAliasTemplateDecl *D2) {
+ // Check template parameters.
+ if (!IsTemplateDeclCommonStructurallyEquivalent(Context, D1, D2))
+ return false;
+
+ // Check the templated declaration.
+ return IsStructurallyEquivalent(Context, D1->getTemplatedDecl(),
+ D2->getTemplatedDecl());
+}
+
+static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
ConceptDecl *D1,
ConceptDecl *D2) {
// Check template parameters.
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index c2ea15567919..12e0a6faa4c3 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -2943,7 +2943,7 @@ bool ParmVarDecl::isDestroyedInCallee() const {
// FIXME: isParamDestroyedInCallee() should probably imply
// isDestructedType()
- auto *RT = getType()->getAs<RecordType>();
+ const auto *RT = getType()->getAs<RecordType>();
if (RT && RT->getDecl()->isParamDestroyedInCallee() &&
getType().isDestructedType())
return true;
@@ -3105,7 +3105,7 @@ FunctionDecl::getDefaultedFunctionInfo() const {
}
bool FunctionDecl::hasBody(const FunctionDecl *&Definition) const {
- for (auto *I : redecls()) {
+ for (const auto *I : redecls()) {
if (I->doesThisDeclarationHaveABody()) {
Definition = I;
return true;
@@ -3116,7 +3116,7 @@ bool FunctionDecl::hasBody(const FunctionDecl *&Definition) const {
}
bool FunctionDecl::hasTrivialBody() const {
- Stmt *S = getBody();
+ const Stmt *S = getBody();
if (!S) {
// Since we don't have a body for this function, we don't know if it's
// trivial or not.
@@ -3212,7 +3212,7 @@ void FunctionDecl::setPure(bool P) {
template<std::size_t Len>
static bool isNamed(const NamedDecl *ND, const char (&Str)[Len]) {
- IdentifierInfo *II = ND->getIdentifier();
+ const IdentifierInfo *II = ND->getIdentifier();
return II && II->isStr(Str);
}
@@ -3305,9 +3305,9 @@ bool FunctionDecl::isReservedGlobalPlacementOperator() const {
if (proto->getNumParams() != 2 || proto->isVariadic())
return false;
- ASTContext &Context =
- cast<TranslationUnitDecl>(getDeclContext()->getRedeclContext())
- ->getASTContext();
+ const ASTContext &Context =
+ cast<TranslationUnitDecl>(getDeclContext()->getRedeclContext())
+ ->getASTContext();
// The result type and first argument type are constant across all
// these operators. The second argument must be exactly void*.
@@ -3342,7 +3342,7 @@ bool FunctionDecl::isReplaceableGlobalAllocationFunction(
unsigned Params = 1;
QualType Ty = FPT->getParamType(Params);
- ASTContext &Ctx = getASTContext();
+ const ASTContext &Ctx = getASTContext();
auto Consume = [&] {
++Params;
@@ -3388,7 +3388,8 @@ bool FunctionDecl::isReplaceableGlobalAllocationFunction(
QualType T = Ty;
while (const auto *TD = T->getAs<TypedefType>())
T = TD->getDecl()->getUnderlyingType();
- IdentifierInfo *II = T->castAs<EnumType>()->getDecl()->getIdentifier();
+ const IdentifierInfo *II =
+ T->castAs<EnumType>()->getDecl()->getIdentifier();
if (II && II->isStr("__hot_cold_t"))
Consume();
}
@@ -3586,7 +3587,7 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const {
(!hasAttr<ArmBuiltinAliasAttr>() && !hasAttr<BuiltinAliasAttr>()))
return 0;
- ASTContext &Context = getASTContext();
+ const ASTContext &Context = getASTContext();
if (!Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))
return BuiltinID;
@@ -3745,7 +3746,7 @@ bool FunctionDecl::doesDeclarationForceExternallyVisibleDefinition() const {
assert(!doesThisDeclarationHaveABody() &&
"Must have a declaration without a body.");
- ASTContext &Context = getASTContext();
+ const ASTContext &Context = getASTContext();
if (Context.getLangOpts().MSVCCompat) {
const FunctionDecl *Definition;
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index a240d74d6342..828d4ea35526 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -37,7 +37,6 @@
namespace clang {
namespace interp {
-using APInt = llvm::APInt;
using APSInt = llvm::APSInt;
/// Convert a value to an APValue.
diff --git a/clang/lib/ASTMatchers/Dynamic/Parser.cpp b/clang/lib/ASTMatchers/Dynamic/Parser.cpp
index 27096a83b8dd..6a16c2184fcf 100644
--- a/clang/lib/ASTMatchers/Dynamic/Parser.cpp
+++ b/clang/lib/ASTMatchers/Dynamic/Parser.cpp
@@ -299,10 +299,8 @@ private:
/// Consume all leading whitespace from \c Code.
void consumeWhitespace() {
- Code = Code.drop_while([](char c) {
- // Don't trim newlines.
- return StringRef(" \t\v\f\r").contains(c);
- });
+ // Don't trim newlines.
+ Code = Code.ltrim(" \t\v\f\r");
}
SourceLocation currentLocation() {
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 93919cd0243d..96fe6df88dbb 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -1034,7 +1034,7 @@ RecordStorageLocation *getImplicitObjectLocation(const CXXMemberCallExpr &MCE,
if (ImplicitObject == nullptr)
return nullptr;
if (ImplicitObject->getType()->isPointerType()) {
- if (auto *Val = cast_or_null<PointerValue>(Env.getValue(*ImplicitObject)))
+ if (auto *Val = Env.get<PointerValue>(*ImplicitObject))
return &cast<RecordStorageLocation>(Val->getPointeeLoc());
return nullptr;
}
@@ -1048,11 +1048,11 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME,
if (Base == nullptr)
return nullptr;
if (ME.isArrow()) {
- if (auto *Val = cast_or_null<PointerValue>(Env.getValue(*Base)))
+ if (auto *Val = Env.get<PointerValue>(*Base))
return &cast<RecordStorageLocation>(Val->getPointeeLoc());
return nullptr;
}
- return cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Base));
+ return Env.get<RecordStorageLocation>(*Base);
}
std::vector<FieldDecl *> getFieldsForInitListExpr(const RecordDecl *RD) {
@@ -1077,7 +1077,7 @@ RecordValue &refreshRecordValue(const Expr &Expr, Environment &Env) {
assert(Expr.getType()->isRecordType());
if (Expr.isPRValue()) {
- if (auto *ExistingVal = cast_or_null<RecordValue>(Env.getValue(Expr))) {
+ if (auto *ExistingVal = Env.get<RecordValue>(Expr)) {
auto &NewVal = Env.create<RecordValue>(ExistingVal->getLoc());
Env.setValue(Expr, NewVal);
Env.setValue(NewVal.getLoc(), NewVal);
@@ -1089,8 +1089,7 @@ RecordValue &refreshRecordValue(const Expr &Expr, Environment &Env) {
return NewVal;
}
- if (auto *Loc =
- cast_or_null<RecordStorageLocation>(Env.getStorageLocation(Expr))) {
+ if (auto *Loc = Env.get<RecordStorageLocation>(Expr)) {
auto &NewVal = Env.create<RecordValue>(*Loc);
Env.setValue(*Loc, NewVal);
return NewVal;
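
The dataflow changes in this and the following files replace the repeated `cast_or_null<T>(Env.getValue(...))` / `cast_or_null<T>(Env.getStorageLocation(...))` pattern with `Env.get<T>(...)`. A minimal sketch of what such an accessor amounts to, assuming it merely fuses the lookup with the null-safe downcast (illustrative only, not the exact Environment API):

    // Sketch: return the value bound to `E`, downcast to `ValueT`, or null.
    template <typename ValueT>
    ValueT *get(const Expr &E) const {
      return llvm::cast_or_null<ValueT>(getValue(E));
    }
    // The storage-location flavour is analogous:
    //   Env.get<RecordStorageLocation>(Expr)
    //   == cast_or_null<RecordStorageLocation>(Env.getStorageLocation(Expr))
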
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index 69ac2c2b82cf..1d31b22b6d25 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -226,7 +226,7 @@ auto isComparisonOperatorCall(L lhs_arg_matcher, R rhs_arg_matcher) {
/// Ensures that `Expr` is mapped to a `BoolValue` and returns its formula.
const Formula &forceBoolValue(Environment &Env, const Expr &Expr) {
- auto *Value = cast_or_null<BoolValue>(Env.getValue(Expr));
+ auto *Value = Env.get<BoolValue>(Expr);
if (Value != nullptr)
return Value->formula();
@@ -267,7 +267,7 @@ BoolValue *getHasValue(Environment &Env, RecordStorageLocation *OptionalLoc) {
if (OptionalLoc == nullptr)
return nullptr;
StorageLocation &HasValueLoc = locForHasValue(*OptionalLoc);
- auto *HasValueVal = cast_or_null<BoolValue>(Env.getValue(HasValueLoc));
+ auto *HasValueVal = Env.get<BoolValue>(HasValueLoc);
if (HasValueVal == nullptr) {
HasValueVal = &Env.makeAtomicBoolValue();
Env.setValue(HasValueLoc, *HasValueVal);
@@ -406,7 +406,7 @@ void transferCallReturningOptional(const CallExpr *E,
if (E->isPRValue()) {
Loc = &State.Env.getResultObjectLocation(*E);
} else {
- Loc = cast_or_null<RecordStorageLocation>(State.Env.getStorageLocation(*E));
+ Loc = State.Env.get<RecordStorageLocation>(*E);
if (Loc == nullptr) {
Loc = &cast<RecordStorageLocation>(State.Env.createStorageLocation(*E));
State.Env.setStorageLocation(*E, *Loc);
@@ -449,8 +449,7 @@ BoolValue &valueOrConversionHasValue(const FunctionDecl &F, const Expr &E,
// This is a constructor/assignment call for `optional<T>` with argument of
// type `optional<U>` such that `T` is constructible from `U`.
- auto *Loc =
- cast_or_null<RecordStorageLocation>(State.Env.getStorageLocation(E));
+ auto *Loc = State.Env.get<RecordStorageLocation>(E);
if (auto *HasValueVal = getHasValue(State.Env, Loc))
return *HasValueVal;
return State.Env.makeAtomicBoolValue();
@@ -471,8 +470,7 @@ void transferAssignment(const CXXOperatorCallExpr *E, BoolValue &HasValueVal,
LatticeTransferState &State) {
assert(E->getNumArgs() > 0);
- if (auto *Loc = cast_or_null<RecordStorageLocation>(
- State.Env.getStorageLocation(*E->getArg(0)))) {
+ if (auto *Loc = State.Env.get<RecordStorageLocation>(*E->getArg(0))) {
createOptionalValue(*Loc, HasValueVal, State.Env);
// Assign a storage location for the whole expression.
@@ -534,18 +532,15 @@ void transferSwapCall(const CXXMemberCallExpr *E,
const MatchFinder::MatchResult &,
LatticeTransferState &State) {
assert(E->getNumArgs() == 1);
- auto *OtherLoc = cast_or_null<RecordStorageLocation>(
- State.Env.getStorageLocation(*E->getArg(0)));
+ auto *OtherLoc = State.Env.get<RecordStorageLocation>(*E->getArg(0));
transferSwap(getImplicitObjectLocation(*E, State.Env), OtherLoc, State.Env);
}
void transferStdSwapCall(const CallExpr *E, const MatchFinder::MatchResult &,
LatticeTransferState &State) {
assert(E->getNumArgs() == 2);
- auto *Arg0Loc = cast_or_null<RecordStorageLocation>(
- State.Env.getStorageLocation(*E->getArg(0)));
- auto *Arg1Loc = cast_or_null<RecordStorageLocation>(
- State.Env.getStorageLocation(*E->getArg(1)));
+ auto *Arg0Loc = State.Env.get<RecordStorageLocation>(*E->getArg(0));
+ auto *Arg1Loc = State.Env.get<RecordStorageLocation>(*E->getArg(1));
transferSwap(Arg0Loc, Arg1Loc, State.Env);
}
@@ -585,11 +580,9 @@ void transferOptionalAndOptionalCmp(const clang::CXXOperatorCallExpr *CmpExpr,
Environment &Env = State.Env;
auto &A = Env.arena();
auto *CmpValue = &forceBoolValue(Env, *CmpExpr);
- auto *Arg0Loc = cast_or_null<RecordStorageLocation>(
- Env.getStorageLocation(*CmpExpr->getArg(0)));
+ auto *Arg0Loc = Env.get<RecordStorageLocation>(*CmpExpr->getArg(0));
if (auto *LHasVal = getHasValue(Env, Arg0Loc)) {
- auto *Arg1Loc = cast_or_null<RecordStorageLocation>(
- Env.getStorageLocation(*CmpExpr->getArg(1)));
+ auto *Arg1Loc = Env.get<RecordStorageLocation>(*CmpExpr->getArg(1));
if (auto *RHasVal = getHasValue(Env, Arg1Loc)) {
if (CmpExpr->getOperator() == clang::OO_ExclaimEqual)
CmpValue = &A.makeNot(*CmpValue);
@@ -603,7 +596,7 @@ void transferOptionalAndValueCmp(const clang::CXXOperatorCallExpr *CmpExpr,
const clang::Expr *E, Environment &Env) {
auto &A = Env.arena();
auto *CmpValue = &forceBoolValue(Env, *CmpExpr);
- auto *Loc = cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*E));
+ auto *Loc = Env.get<RecordStorageLocation>(*E);
if (auto *HasVal = getHasValue(Env, Loc)) {
if (CmpExpr->getOperator() == clang::OO_ExclaimEqual)
CmpValue = &A.makeNot(*CmpValue);
@@ -616,7 +609,7 @@ void transferOptionalAndNulloptCmp(const clang::CXXOperatorCallExpr *CmpExpr,
const clang::Expr *E, Environment &Env) {
auto &A = Env.arena();
auto *CmpValue = &forceBoolValue(Env, *CmpExpr);
- auto *Loc = cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*E));
+ auto *Loc = Env.get<RecordStorageLocation>(*E);
if (auto *HasVal = getHasValue(Env, Loc)) {
if (CmpExpr->getOperator() == clang::OO_ExclaimEqual)
CmpValue = &A.makeNot(*CmpValue);
diff --git a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
index caaf443382b0..da4dd6dc0785 100644
--- a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
@@ -66,19 +66,8 @@ void clang::dataflow::copyRecord(RecordStorageLocation &Src,
}
}
- RecordValue *SrcVal = cast_or_null<RecordValue>(Env.getValue(Src));
- RecordValue *DstVal = cast_or_null<RecordValue>(Env.getValue(Dst));
-
- DstVal = &Env.create<RecordValue>(Dst);
+ RecordValue *DstVal = &Env.create<RecordValue>(Dst);
Env.setValue(Dst, *DstVal);
-
- if (SrcVal == nullptr)
- return;
-
- for (const auto &[Name, Value] : SrcVal->properties()) {
- if (Value != nullptr)
- DstVal->setProperty(Name, *Value);
- }
}
bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1,
@@ -125,25 +114,5 @@ bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1,
}
}
- llvm::StringMap<Value *> Props1, Props2;
-
- if (RecordValue *Val1 = cast_or_null<RecordValue>(Env1.getValue(Loc1)))
- for (const auto &[Name, Value] : Val1->properties())
- Props1[Name] = Value;
- if (RecordValue *Val2 = cast_or_null<RecordValue>(Env2.getValue(Loc2)))
- for (const auto &[Name, Value] : Val2->properties())
- Props2[Name] = Value;
-
- if (Props1.size() != Props2.size())
- return false;
-
- for (const auto &[Name, Value] : Props1) {
- auto It = Props2.find(Name);
- if (It == Props2.end())
- return false;
- if (Value != It->second)
- return false;
- }
-
return true;
}
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 346469660662..55093c2e2cda 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -339,8 +339,7 @@ public:
switch (S->getOpcode()) {
case UO_Deref: {
- const auto *SubExprVal =
- cast_or_null<PointerValue>(Env.getValue(*SubExpr));
+ const auto *SubExprVal = Env.get<PointerValue>(*SubExpr);
if (SubExprVal == nullptr)
break;
@@ -467,8 +466,7 @@ public:
const Expr *Arg = S->getArg(0);
assert(Arg != nullptr);
- auto *ArgLoc =
- cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Arg));
+ auto *ArgLoc = Env.get<RecordStorageLocation>(*Arg);
if (ArgLoc == nullptr)
return;
@@ -515,14 +513,12 @@ public:
RecordStorageLocation *LocSrc = nullptr;
if (Arg1->isPRValue()) {
- if (auto *Val = cast_or_null<RecordValue>(Env.getValue(*Arg1)))
+ if (auto *Val = Env.get<RecordValue>(*Arg1))
LocSrc = &Val->getLoc();
} else {
- LocSrc =
- cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Arg1));
+ LocSrc = Env.get<RecordStorageLocation>(*Arg1);
}
- auto *LocDst =
- cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Arg0));
+ auto *LocDst = Env.get<RecordStorageLocation>(*Arg0);
if (LocSrc == nullptr || LocDst == nullptr)
return;
@@ -676,7 +672,7 @@ public:
auto Init = Inits[InitIdx++];
assert(Base.getType().getCanonicalType() ==
Init->getType().getCanonicalType());
- auto* BaseVal = cast_or_null<RecordValue>(Env.getValue(*Init));
+ auto *BaseVal = Env.get<RecordValue>(*Init);
if (!BaseVal)
BaseVal = cast<RecordValue>(Env.createValue(Init->getType()));
// Take ownership of the fields of the `RecordValue` for the base class
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index 8c9360235da7..faf83a8920d4 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -130,7 +130,7 @@ private:
if (Env.getValue(Cond) == nullptr)
transfer(StmtToEnv, Cond, Env);
- auto *Val = cast_or_null<BoolValue>(Env.getValue(Cond));
+ auto *Val = Env.get<BoolValue>(Cond);
// Value merging depends on flow conditions from different environments
// being mutually exclusive -- that is, they cannot both be true in their
// entirety (even if they may share some clauses). So, we need *some* value
diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp
index 5902c6dc3ce0..d0d8316385b4 100644
--- a/clang/lib/Basic/IdentifierTable.cpp
+++ b/clang/lib/Basic/IdentifierTable.cpp
@@ -628,8 +628,7 @@ ObjCMethodFamily Selector::getMethodFamilyImpl(Selector sel) {
return OMF_performSelector;
// The other method families may begin with a prefix of underscores.
- while (!name.empty() && name.front() == '_')
- name = name.substr(1);
+ name = name.ltrim('_');
if (name.empty()) return OMF_None;
switch (name.front()) {
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index def16c032c86..2f8395cb8932 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -225,6 +225,7 @@ bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef,
BPI.SignKey = LangOptions::SignReturnAddressKeyKind::BKey;
BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement;
+ BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR;
return true;
}
@@ -1364,8 +1365,7 @@ bool AArch64TargetInfo::validateConstraintModifier(
StringRef Constraint, char Modifier, unsigned Size,
std::string &SuggestedModifier) const {
// Strip off constraint modifiers.
- while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&')
- Constraint = Constraint.substr(1);
+ Constraint = Constraint.ltrim("=+&");
switch (Constraint[0]) {
default:
diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index ce7e4d4639ce..01f9e844da12 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -419,6 +419,7 @@ bool ARMTargetInfo::validateBranchProtection(StringRef Spec, StringRef Arch,
BPI.SignKey = LangOptions::SignReturnAddressKeyKind::AKey;
BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement;
+ BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR;
return true;
}
@@ -1229,8 +1230,7 @@ bool ARMTargetInfo::validateConstraintModifier(
bool isInOut = (Constraint[0] == '+');
// Strip off constraint modifiers.
- while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&')
- Constraint = Constraint.substr(1);
+ Constraint = Constraint.ltrim("=+&");
switch (Constraint[0]) {
default:
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index 685462961ee3..6bc57a83a2d5 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -416,8 +416,7 @@ static void handleFullArchString(StringRef FullArchStr,
Features.push_back("__RISCV_TargetAttrNeedOverride");
auto RII = llvm::RISCVISAInfo::parseArchString(
FullArchStr, /* EnableExperimentalExtension */ true);
- if (!RII) {
- consumeError(RII.takeError());
+ if (llvm::errorToBool(RII.takeError())) {
// Forward the invalid FullArchStr.
Features.push_back("+" + FullArchStr.str());
} else {
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index b97f88647fa4..3deaa19f8d4f 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -1613,8 +1613,7 @@ bool X86TargetInfo::validateOutputSize(const llvm::StringMap<bool> &FeatureMap,
StringRef Constraint,
unsigned Size) const {
// Strip off constraint modifiers.
- while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&')
- Constraint = Constraint.substr(1);
+ Constraint = Constraint.ltrim("=+&");
return validateOperandSize(FeatureMap, Constraint, Size);
}
diff --git a/clang/lib/Basic/Version.cpp b/clang/lib/Basic/Version.cpp
index e205da7adec1..4823f566bd77 100644
--- a/clang/lib/Basic/Version.cpp
+++ b/clang/lib/Basic/Version.cpp
@@ -57,6 +57,14 @@ std::string getLLVMRevision() {
#endif
}
+std::string getClangVendor() {
+#ifdef CLANG_VENDOR
+ return CLANG_VENDOR;
+#else
+ return "";
+#endif
+}
+
std::string getClangFullRepositoryVersion() {
std::string buf;
llvm::raw_string_ostream OS(buf);
@@ -92,10 +100,7 @@ std::string getClangFullVersion() {
std::string getClangToolFullVersion(StringRef ToolName) {
std::string buf;
llvm::raw_string_ostream OS(buf);
-#ifdef CLANG_VENDOR
- OS << CLANG_VENDOR;
-#endif
- OS << ToolName << " version " CLANG_VERSION_STRING;
+ OS << getClangVendor() << ToolName << " version " CLANG_VERSION_STRING;
std::string repo = getClangFullRepositoryVersion();
if (!repo.empty()) {
@@ -110,10 +115,7 @@ std::string getClangFullCPPVersion() {
// the one we report on the command line.
std::string buf;
llvm::raw_string_ostream OS(buf);
-#ifdef CLANG_VENDOR
- OS << CLANG_VENDOR;
-#endif
- OS << "Clang " CLANG_VERSION_STRING;
+ OS << getClangVendor() << "Clang " CLANG_VERSION_STRING;
std::string repo = getClangFullRepositoryVersion();
if (!repo.empty()) {
diff --git a/clang/lib/Basic/Warnings.cpp b/clang/lib/Basic/Warnings.cpp
index cb23d844ef8f..92954cab6fb0 100644
--- a/clang/lib/Basic/Warnings.cpp
+++ b/clang/lib/Basic/Warnings.cpp
@@ -96,11 +96,7 @@ void clang::ProcessWarningOptions(DiagnosticsEngine &Diags,
// Check to see if this warning starts with "no-", if so, this is a
// negative form of the option.
- bool isPositive = true;
- if (Opt.starts_with("no-")) {
- isPositive = false;
- Opt = Opt.substr(3);
- }
+ bool isPositive = !Opt.consume_front("no-");
// Figure out how this option affects the warning. If -Wfoo, map the
// diagnostic to a warning, if -Wno-foo, map it to ignore.
@@ -198,8 +194,7 @@ void clang::ProcessWarningOptions(DiagnosticsEngine &Diags,
}
}
- for (unsigned i = 0, e = Opts.Remarks.size(); i != e; ++i) {
- StringRef Opt = Opts.Remarks[i];
+ for (StringRef Opt : Opts.Remarks) {
const auto Flavor = diag::Flavor::Remark;
// Check to see if this warning starts with "no-", if so, this is a
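
Several of these cleanups lean on the same `llvm::StringRef` helpers: `consume_front` strips a prefix and reports whether it was present, and `ltrim` drops a leading run of characters. A small self-contained illustration of the idiom, using only the calls that appear in the hunks above:

    #include "llvm/ADT/StringRef.h"
    #include <cassert>

    static void demo() {
      llvm::StringRef Opt = "no-unused-variable";
      // consume_front() returns true and drops the prefix when it matches.
      bool isPositive = !Opt.consume_front("no-");
      assert(!isPositive && Opt == "unused-variable");
      // ltrim() removes any leading run of the given characters.
      llvm::StringRef Constraint = "=&r";
      assert(Constraint.ltrim("=+&") == "r");
    }
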
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 480410db1021..a6142d99f3b6 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -881,7 +881,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
<< PluginFN << toString(PassPlugin.takeError());
}
}
- for (auto PassCallback : CodeGenOpts.PassBuilderCallbacks)
+ for (const auto &PassCallback : CodeGenOpts.PassBuilderCallbacks)
PassCallback(PB);
#define HANDLE_EXTENSION(Ext) \
get##Ext##PluginInfo().RegisterPassBuilderCallbacks(PB);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a29304c81928..f71dbf1729a1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10318,6 +10318,30 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
return nullptr;
}
+static void swapCommutativeSMEOperands(unsigned BuiltinID,
+ SmallVectorImpl<Value *> &Ops) {
+ unsigned MultiVec;
+ switch (BuiltinID) {
+ default:
+ return;
+ case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1:
+ MultiVec = 1;
+ break;
+ case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2:
+ case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2:
+ MultiVec = 2;
+ break;
+ case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4:
+ case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4:
+ MultiVec = 4;
+ break;
+ }
+
+ if (MultiVec > 0)
+ for (unsigned I = 0; I < MultiVec; ++I)
+ std::swap(Ops[I + 1], Ops[I + 1 + MultiVec]);
+}
+
Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
@@ -10340,6 +10364,9 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+ // Handle builtins which require their multi-vector operands to be swapped
+ swapCommutativeSMEOperands(BuiltinID, Ops);
+
// Should not happen!
if (Builtin->LLVMIntrinsic == 0)
return nullptr;
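
swapCommutativeSMEOperands exchanges the two multi-vector operand groups that follow the slice operand, presumably so these signed-by-unsigned builtins can be lowered through the corresponding intrinsic with the operand order it expects. A trace for MultiVec == 2 (operand names are illustrative):

    // Ops before:                  { Slice, Zn0, Zn1, Zm0, Zm1 }
    // I = 0: swap(Ops[1], Ops[3])  -> { Slice, Zm0, Zn1, Zn0, Zm1 }
    // I = 1: swap(Ops[2], Ops[4])  -> { Slice, Zm0, Zm1, Zn0, Zn1 }
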
@@ -10403,6 +10430,26 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
}
+ if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
+ // Create call to __arm_sme_state and store the results to the two pointers.
+ CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {},
+ false),
+ "__arm_sme_state"));
+ auto Attrs =
+ AttributeList()
+ .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible")
+ .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved");
+ CI->setAttributes(Attrs);
+ CI->setCallingConv(
+ llvm::CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
+ Builder.CreateStore(Builder.CreateExtractValue(CI, 0),
+ EmitPointerWithAlignment(E->getArg(0)));
+ return Builder.CreateStore(Builder.CreateExtractValue(CI, 1),
+ EmitPointerWithAlignment(E->getArg(1)));
+ }
+
if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
assert((getContext().getTypeSize(E->getType()) == 32) &&
"rbit of unusual size!");
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index ed426098ac69..e362c9da51fe 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6406,13 +6406,11 @@ static void emitOMPAtomicCaptureExpr(CodeGenFunction &CGF,
}
}
-static void emitOMPAtomicCompareExpr(CodeGenFunction &CGF,
- llvm::AtomicOrdering AO, const Expr *X,
- const Expr *V, const Expr *R,
- const Expr *E, const Expr *D,
- const Expr *CE, bool IsXBinopExpr,
- bool IsPostfixUpdate, bool IsFailOnly,
- SourceLocation Loc) {
+static void emitOMPAtomicCompareExpr(
+ CodeGenFunction &CGF, llvm::AtomicOrdering AO, llvm::AtomicOrdering FailAO,
+ const Expr *X, const Expr *V, const Expr *R, const Expr *E, const Expr *D,
+ const Expr *CE, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly,
+ SourceLocation Loc) {
llvm::OpenMPIRBuilder &OMPBuilder =
CGF.CGM.getOpenMPRuntime().getOMPBuilder();
@@ -6477,13 +6475,21 @@ static void emitOMPAtomicCompareExpr(CodeGenFunction &CGF,
R->getType().isVolatileQualified()};
}
- CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare(
- CGF.Builder, XOpVal, VOpVal, ROpVal, EVal, DVal, AO, Op, IsXBinopExpr,
- IsPostfixUpdate, IsFailOnly));
+ if (FailAO == llvm::AtomicOrdering::NotAtomic) {
+    // fail clause was not mentioned on the
+ // "#pragma omp atomic compare" construct.
+ CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare(
+ CGF.Builder, XOpVal, VOpVal, ROpVal, EVal, DVal, AO, Op, IsXBinopExpr,
+ IsPostfixUpdate, IsFailOnly));
+ } else
+ CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare(
+ CGF.Builder, XOpVal, VOpVal, ROpVal, EVal, DVal, AO, Op, IsXBinopExpr,
+ IsPostfixUpdate, IsFailOnly, FailAO));
}
static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind,
- llvm::AtomicOrdering AO, bool IsPostfixUpdate,
+ llvm::AtomicOrdering AO,
+ llvm::AtomicOrdering FailAO, bool IsPostfixUpdate,
const Expr *X, const Expr *V, const Expr *R,
const Expr *E, const Expr *UE, const Expr *D,
const Expr *CE, bool IsXLHSInRHSPart,
@@ -6504,12 +6510,8 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind,
IsXLHSInRHSPart, Loc);
break;
case OMPC_compare: {
- emitOMPAtomicCompareExpr(CGF, AO, X, V, R, E, D, CE, IsXLHSInRHSPart,
- IsPostfixUpdate, IsFailOnly, Loc);
- break;
- }
- case OMPC_fail: {
- //TODO
+ emitOMPAtomicCompareExpr(CGF, AO, FailAO, X, V, R, E, D, CE,
+ IsXLHSInRHSPart, IsPostfixUpdate, IsFailOnly, Loc);
break;
}
default:
@@ -6519,6 +6521,8 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind,
void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::Monotonic;
+ // Fail Memory Clause Ordering.
+ llvm::AtomicOrdering FailAO = llvm::AtomicOrdering::NotAtomic;
bool MemOrderingSpecified = false;
if (S.getSingleClause<OMPSeqCstClause>()) {
AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -6572,12 +6576,27 @@ void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) {
}
}
+ if (KindsEncountered.contains(OMPC_compare) &&
+ KindsEncountered.contains(OMPC_fail)) {
+ Kind = OMPC_compare;
+ const auto *FailClause = S.getSingleClause<OMPFailClause>();
+ if (FailClause) {
+ OpenMPClauseKind FailParameter = FailClause->getFailParameter();
+ if (FailParameter == llvm::omp::OMPC_relaxed)
+ FailAO = llvm::AtomicOrdering::Monotonic;
+ else if (FailParameter == llvm::omp::OMPC_acquire)
+ FailAO = llvm::AtomicOrdering::Acquire;
+ else if (FailParameter == llvm::omp::OMPC_seq_cst)
+ FailAO = llvm::AtomicOrdering::SequentiallyConsistent;
+ }
+ }
+
LexicalScope Scope(*this, S.getSourceRange());
EmitStopPoint(S.getAssociatedStmt());
- emitOMPAtomicExpr(*this, Kind, AO, S.isPostfixUpdate(), S.getX(), S.getV(),
- S.getR(), S.getExpr(), S.getUpdateExpr(), S.getD(),
- S.getCondExpr(), S.isXLHSInRHSPart(), S.isFailOnly(),
- S.getBeginLoc());
+ emitOMPAtomicExpr(*this, Kind, AO, FailAO, S.isPostfixUpdate(), S.getX(),
+ S.getV(), S.getR(), S.getExpr(), S.getUpdateExpr(),
+ S.getD(), S.getCondExpr(), S.isXLHSInRHSPart(),
+ S.isFailOnly(), S.getBeginLoc());
}
static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
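
The fail-clause handling added above maps the clause's memory-order parameter straight onto an llvm::AtomicOrdering for the compare-exchange failure path: relaxed → Monotonic, acquire → Acquire, seq_cst → SequentiallyConsistent. A sketch of source that now reaches the new path (variable names are illustrative):

    double x, e, d;
    // FailAO becomes llvm::AtomicOrdering::Acquire for this directive.
    #pragma omp atomic compare fail(acquire)
    if (x == e) { x = d; }
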
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index 753a8fd74fa6..f8038497d90a 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -1139,8 +1139,7 @@ CodeGenAction::loadModule(MemoryBufferRef MBRef) {
// Strip off a leading diagnostic code if there is one.
StringRef Msg = Err.getMessage();
- if (Msg.starts_with("error: "))
- Msg = Msg.substr(7);
+ Msg.consume_front("error: ");
unsigned DiagID =
CI.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error, "%0");
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 7ad26ace328a..d78f2594a237 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -995,12 +995,7 @@ void CodeGenModule::Release() {
uint32_t(CLANG_VERSION_MINOR));
getModule().addModuleFlag(llvm::Module::Warning, "zos_product_patchlevel",
uint32_t(CLANG_VERSION_PATCHLEVEL));
- std::string ProductId;
-#ifdef CLANG_VENDOR
- ProductId = #CLANG_VENDOR;
-#else
- ProductId = "clang";
-#endif
+ std::string ProductId = getClangVendor() + "clang";
getModule().addModuleFlag(llvm::Module::Error, "zos_product_id",
llvm::MDString::get(VMContext, ProductId));
@@ -1111,6 +1106,9 @@ void CodeGenModule::Release() {
if (LangOpts.BranchTargetEnforcement)
getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement",
1);
+ if (LangOpts.BranchProtectionPAuthLR)
+ getModule().addModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr",
+ 1);
if (LangOpts.hasSignReturnAddress())
getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1);
if (LangOpts.isSignReturnAddressScopeAll())
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index be5145daa00b..7102d190fe00 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -136,6 +136,8 @@ public:
Fn->addFnAttr("branch-target-enforcement",
BPI.BranchTargetEnforcement ? "true" : "false");
+ Fn->addFnAttr("branch-protection-pauth-lr",
+ BPI.BranchProtectionPAuthLR ? "true" : "false");
}
bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp
index 7b2c31139b0b..63b9a1fdb988 100644
--- a/clang/lib/CodeGen/Targets/LoongArch.cpp
+++ b/clang/lib/CodeGen/Targets/LoongArch.cpp
@@ -324,13 +324,6 @@ ABIArgInfo LoongArchABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
return ABIArgInfo::getDirect();
}
- // Pass 128-bit/256-bit vector values via vector registers directly.
- if (Ty->isVectorType() && (((getContext().getTypeSize(Ty) == 128) &&
- (getTarget().hasFeature("lsx"))) ||
- ((getContext().getTypeSize(Ty) == 256) &&
- getTarget().hasFeature("lasx"))))
- return ABIArgInfo::getDirect();
-
// Complex types for the *f or *d ABI must be passed directly rather than
// using CoerceAndExpand.
if (IsFixed && Ty->isComplexType() && FRLen && FARsLeft >= 2) {
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index ff95c899c5f3..9b2f2a374809 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -86,6 +86,7 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
@@ -670,10 +671,15 @@ static llvm::Triple computeTargetTriple(const Driver &D,
if (Args.hasArg(options::OPT_march_EQ) ||
Args.hasArg(options::OPT_mcpu_EQ)) {
StringRef ArchName = tools::riscv::getRISCVArch(Args, Target);
- if (ArchName.starts_with_insensitive("rv32"))
- Target.setArch(llvm::Triple::riscv32);
- else if (ArchName.starts_with_insensitive("rv64"))
- Target.setArch(llvm::Triple::riscv64);
+ auto ISAInfo = llvm::RISCVISAInfo::parseArchString(
+ ArchName, /*EnableExperimentalExtensions=*/true);
+ if (!llvm::errorToBool(ISAInfo.takeError())) {
+ unsigned XLen = (*ISAInfo)->getXLen();
+ if (XLen == 32)
+ Target.setArch(llvm::Triple::riscv32);
+ else if (XLen == 64)
+ Target.setArch(llvm::Triple::riscv64);
+ }
}
}
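
Deriving the sub-architecture from the parsed ISA string rather than a prefix check means the triple follows whatever XLEN the -march/-mcpu value actually encodes, and stays untouched when the string does not parse. Roughly:

    // -march=rv64gc    -> parseArchString succeeds, getXLen() == 64 -> Triple::riscv64
    // -march=rv32imac  -> getXLen() == 32                           -> Triple::riscv32
    // unparsable arch  -> errorToBool() is true, the triple is left unchanged
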
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index f9670ea6f251..e6126ff62db3 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -328,6 +328,12 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA,
}
}
+ if (D.IsFlangMode()) {
+ addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs);
+ addFortranRuntimeLibs(ToolChain, Args, CmdArgs);
+ CmdArgs.push_back("-lm");
+ CmdArgs.push_back("-lpthread");
+ }
const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
Exec, CmdArgs, Inputs, Output));
diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
index 25b43cefce6b..0717e3b813e1 100644
--- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -167,13 +167,6 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple,
Features.push_back("-relax");
}
- // GCC Compatibility: -mno-save-restore is default, unless -msave-restore is
- // specified.
- if (Args.hasFlag(options::OPT_msave_restore, options::OPT_mno_save_restore, false))
- Features.push_back("+save-restore");
- else
- Features.push_back("-save-restore");
-
// -mno-unaligned-access is default, unless -munaligned-access is specified.
AddTargetFeature(Args, Features, options::OPT_munaligned_access,
options::OPT_mno_unaligned_access, "fast-unaligned-access");
@@ -222,10 +215,8 @@ StringRef riscv::getRISCVABI(const ArgList &Args, const llvm::Triple &Triple) {
auto ParseResult = llvm::RISCVISAInfo::parseArchString(
Arch, /* EnableExperimentalExtension */ true);
- if (!ParseResult)
- // Ignore parsing error, just go 3rd step.
- consumeError(ParseResult.takeError());
- else
+  // Ignore parsing error, just go to the 3rd step.
+ if (!llvm::errorToBool(ParseResult.takeError()))
return (*ParseResult)->computeDefaultABI();
// 3. Choose a default based on the triple
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index fef0522aaf45..53e26a9f8e22 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -237,9 +237,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
assert(Name.starts_with("m") && "Invalid feature name.");
Name = Name.substr(1);
- bool IsNegative = Name.starts_with("no-");
- if (IsNegative)
- Name = Name.substr(3);
+ bool IsNegative = Name.consume_front("no-");
#ifndef NDEBUG
assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index de9fd5eaa1e0..acfa11980506 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1497,7 +1497,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
<< Triple.getArchName();
StringRef Scope, Key;
- bool IndirectBranches;
+ bool IndirectBranches, BranchProtectionPAuthLR;
if (A->getOption().matches(options::OPT_msign_return_address_EQ)) {
Scope = A->getValue();
@@ -1506,6 +1506,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
<< A->getSpelling() << Scope;
Key = "a_key";
IndirectBranches = false;
+ BranchProtectionPAuthLR = false;
} else {
StringRef DiagMsg;
llvm::ARM::ParsedBranchProtection PBP;
@@ -1517,6 +1518,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
<< "b-key" << A->getAsString(Args);
Scope = PBP.Scope;
Key = PBP.Key;
+ BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR;
IndirectBranches = PBP.BranchTargetEnforcement;
}
@@ -1525,6 +1527,9 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
if (!Scope.equals("none"))
CmdArgs.push_back(
Args.MakeArgString(Twine("-msign-return-address-key=") + Key));
+ if (BranchProtectionPAuthLR)
+ CmdArgs.push_back(
+ Args.MakeArgString(Twine("-mbranch-protection-pauth-lr")));
if (IndirectBranches)
CmdArgs.push_back("-mbranch-target-enforce");
}
@@ -2067,12 +2072,9 @@ void Clang::AddRISCVTargetArgs(const ArgList &Args,
StringRef Arch = riscv::getRISCVArch(Args, Triple);
auto ISAInfo = llvm::RISCVISAInfo::parseArchString(
Arch, /*EnableExperimentalExtensions*/ true);
- if (!ISAInfo) {
- // Ignore parsing error.
- consumeError(ISAInfo.takeError());
- } else {
+ // Ignore parsing error.
+ if (!errorToBool(ISAInfo.takeError()))
MinVLen = (*ISAInfo)->getMinVLen();
- }
// If the value is "zvl", use MinVLen from march. Otherwise, try to parse
// as integer as long as we have a MinVLen.
@@ -3198,13 +3200,13 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
options::OPT_fstrict_float_cast_overflow, false))
CmdArgs.push_back("-fno-strict-float-cast-overflow");
- if (const Arg *A = Args.getLastArg(options::OPT_fcx_limited_range))
+ if (Args.hasArg(options::OPT_fcx_limited_range))
CmdArgs.push_back("-fcx-limited-range");
- if (const Arg *A = Args.getLastArg(options::OPT_fcx_fortran_rules))
+ if (Args.hasArg(options::OPT_fcx_fortran_rules))
CmdArgs.push_back("-fcx-fortran-rules");
- if (const Arg *A = Args.getLastArg(options::OPT_fno_cx_limited_range))
+ if (Args.hasArg(options::OPT_fno_cx_limited_range))
CmdArgs.push_back("-fno-cx-limited-range");
- if (const Arg *A = Args.getLastArg(options::OPT_fno_cx_fortran_rules))
+ if (Args.hasArg(options::OPT_fno_cx_fortran_rules))
CmdArgs.push_back("-fno-cx-fortran-rules");
}
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 4f4bdac793be..2340191ca97d 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1133,6 +1133,16 @@ static bool isWholeArchivePresent(const ArgList &Args) {
return WholeArchiveActive;
}
+/// Determine if the driver is invoked to create a shared object library (-shared)
+static bool isSharedLinkage(const ArgList &Args) {
+ return Args.hasArg(options::OPT_shared);
+}
+
+/// Determine if the driver is invoked to create a static library (-static)
+static bool isStaticLinkage(const ArgList &Args) {
+ return Args.hasArg(options::OPT_static);
+}
+
/// Add Fortran runtime libs for MSVC
static void addFortranRuntimeLibsMSVC(const ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) {
@@ -1164,6 +1174,16 @@ static void addFortranRuntimeLibsMSVC(const ArgList &Args,
// Add FortranMain runtime lib
static void addFortranMain(const ToolChain &TC, const ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) {
+  // 0. Shared- or static-library linkage
+  // If we are building a shared or static library, we should not add
+  // -lFortran_main.a to the link line: the `main` symbol is not
+  // required for a library and will instead be provided by one of
+  // the translation units of the code that is eventually linked
+  // against this library.
+ if (isSharedLinkage(Args) || isStaticLinkage(Args)) {
+ return;
+ }
+
// 1. MSVC
if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
addFortranRuntimeLibsMSVC(Args, CmdArgs);
@@ -1174,8 +1194,9 @@ static void addFortranMain(const ToolChain &TC, const ArgList &Args,
// The --whole-archive option needs to be part of the link line to make
// sure that the main() function from Fortran_main.a is pulled in by the
// linker. However, it shouldn't be used if it's already active.
- // TODO: Find an equivalent of `--whole-archive` for Darwin.
- if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX()) {
+ // TODO: Find an equivalent of `--whole-archive` for Darwin and AIX.
+ if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX() &&
+ !TC.getTriple().isOSAIX()) {
CmdArgs.push_back("--whole-archive");
CmdArgs.push_back("-lFortran_main");
CmdArgs.push_back("--no-whole-archive");
@@ -2367,8 +2388,7 @@ static void GetSDLFromOffloadArchive(
FoundAOB = true;
}
} else {
- if (Lib.starts_with("-l"))
- Lib = Lib.drop_front(2);
+ Lib.consume_front("-l");
for (auto LPath : LibraryPaths) {
ArchiveOfBundles.clear();
auto LibFile = (Lib.starts_with(":") ? Lib.drop_front()
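
Taken together, a regular Fortran executable link still brackets the entry-point archive so the linker pulls `main` out of it, while `-shared` and `-static` library links omit it altogether. On an ELF toolchain the added fragment looks roughly like this (the surrounding runtime libraries are elided):

    ... --whole-archive -lFortran_main --no-whole-archive <Fortran runtime libs> ...
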
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 38361d6889a1..a610a94a39a2 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -1741,11 +1741,9 @@ selectRISCVMultilib(const MultilibSet &RISCVMultilibSet, StringRef Arch,
llvm::RISCVISAInfo::parseArchString(
Arch, /*EnableExperimentalExtension=*/true,
/*ExperimentalExtensionVersionCheck=*/false);
- if (!ParseResult) {
- // Ignore any error here, we assume it will be handled in another place.
- consumeError(ParseResult.takeError());
+ // Ignore any error here, we assume it will be handled in another place.
+ if (llvm::errorToBool(ParseResult.takeError()))
return false;
- }
auto &ISAInfo = *ParseResult;
@@ -1780,10 +1778,8 @@ selectRISCVMultilib(const MultilibSet &RISCVMultilibSet, StringRef Arch,
llvm::RISCVISAInfo::parseArchString(
Flag, /*EnableExperimentalExtension=*/true,
/*ExperimentalExtensionVersionCheck=*/false);
- if (!MLConfigParseResult) {
- // Ignore any error here, we assume it will handled in another place.
- llvm::consumeError(MLConfigParseResult.takeError());
-
+      // Ignore any error here, we assume it will be handled in another place.
+ if (llvm::errorToBool(MLConfigParseResult.takeError())) {
// We might get a parsing error if rv32e in the list, we could just skip
// that and process the rest of multi-lib configs.
Skip = true;
diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp
index 9a9792d019d5..200ac46aa534 100644
--- a/clang/lib/Driver/ToolChains/Solaris.cpp
+++ b/clang/lib/Driver/ToolChains/Solaris.cpp
@@ -295,13 +295,12 @@ static StringRef getSolarisLibSuffix(const llvm::Triple &Triple) {
switch (Triple.getArch()) {
case llvm::Triple::x86:
case llvm::Triple::sparc:
+ default:
break;
case llvm::Triple::x86_64:
return "/amd64";
case llvm::Triple::sparcv9:
return "/sparcv9";
- default:
- llvm_unreachable("Unsupported architecture");
}
return "";
}
diff --git a/clang/lib/Format/CMakeLists.txt b/clang/lib/Format/CMakeLists.txt
index 015ec7c0cc84..84a3c136f650 100644
--- a/clang/lib/Format/CMakeLists.txt
+++ b/clang/lib/Format/CMakeLists.txt
@@ -11,6 +11,7 @@ add_clang_library(clangFormat
IntegerLiteralSeparatorFixer.cpp
MacroCallReconstructor.cpp
MacroExpander.cpp
+ MatchFilePath.cpp
NamespaceEndCommentsFixer.cpp
ObjCPropertyAttributeOrderFixer.cpp
QualifierAlignmentFixer.cpp
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index bd319f21b05f..102504182c45 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -398,7 +398,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) {
}
if ((startsNextParameter(Current, Style) || Previous.is(tok::semi) ||
(Previous.is(TT_TemplateCloser) && Current.is(TT_StartOfName) &&
- Style.isCpp() &&
+ State.Line->First->isNot(TT_AttributeSquare) && Style.isCpp() &&
// FIXME: This is a temporary workaround for the case where clang-format
// sets BreakBeforeParameter to avoid bin packing and this creates a
// completely unnecessary line break after a template type that isn't
@@ -583,17 +583,15 @@ bool ContinuationIndenter::mustBreak(const LineState &State) {
return true;
}
- // If the return type spans multiple lines, wrap before the function name.
- if (((Current.is(TT_FunctionDeclarationName) &&
- !State.Line->ReturnTypeWrapped &&
- // Don't break before a C# function when no break after return type.
- (!Style.isCSharp() ||
- Style.AlwaysBreakAfterReturnType != FormatStyle::RTBS_None) &&
- // Don't always break between a JavaScript `function` and the function
- // name.
- !Style.isJavaScript()) ||
- (Current.is(tok::kw_operator) && Previous.isNot(tok::coloncolon))) &&
- Previous.isNot(tok::kw_template) && CurrentState.BreakBeforeParameter) {
+ if (Current.is(TT_FunctionDeclarationName) &&
+ !State.Line->ReturnTypeWrapped &&
+ // Don't break before a C# function when no break after return type.
+ (!Style.isCSharp() ||
+ Style.AlwaysBreakAfterReturnType != FormatStyle::RTBS_None) &&
+ // Don't always break between a JavaScript `function` and the function
+ // name.
+ !Style.isJavaScript() && Previous.isNot(tok::kw_template) &&
+ CurrentState.BreakBeforeParameter) {
return true;
}
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 28271181e07d..f798d555bf99 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -1698,6 +1698,9 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
/*BasedOnStyle=*/"google",
},
};
+ GoogleStyle.AttributeMacros.push_back("GUARDED_BY");
+ GoogleStyle.AttributeMacros.push_back("ABSL_GUARDED_BY");
+
GoogleStyle.SpacesBeforeTrailingComments = 2;
GoogleStyle.Standard = FormatStyle::LS_Auto;
diff --git a/clang/lib/Format/MatchFilePath.cpp b/clang/lib/Format/MatchFilePath.cpp
new file mode 100644
index 000000000000..062b334dcdd8
--- /dev/null
+++ b/clang/lib/Format/MatchFilePath.cpp
@@ -0,0 +1,122 @@
+//===--- MatchFilePath.cpp - Match file path with pattern -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the functionality of matching a file path name to
+/// a pattern, similar to the POSIX fnmatch() function.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MatchFilePath.h"
+
+using namespace llvm;
+
+namespace clang {
+namespace format {
+
+// Check whether `FilePath` matches `Pattern` based on POSIX 2.13.1, 2.13.2, and
+// Rule 1 of 2.13.3.
+bool matchFilePath(StringRef Pattern, StringRef FilePath) {
+ assert(!Pattern.empty());
+ assert(!FilePath.empty());
+
+ // No match if `Pattern` ends with a non-meta character not equal to the last
+ // character of `FilePath`.
+ if (const auto C = Pattern.back(); !strchr("?*]", C) && C != FilePath.back())
+ return false;
+
+ constexpr auto Separator = '/';
+ const auto EOP = Pattern.size(); // End of `Pattern`.
+ const auto End = FilePath.size(); // End of `FilePath`.
+ unsigned I = 0; // Index to `Pattern`.
+
+ for (unsigned J = 0; J < End; ++J) {
+ if (I == EOP)
+ return false;
+
+ switch (const auto F = FilePath[J]; Pattern[I]) {
+ case '\\':
+ if (++I == EOP || F != Pattern[I])
+ return false;
+ break;
+ case '?':
+ if (F == Separator)
+ return false;
+ break;
+ case '*': {
+ while (++I < EOP && Pattern[I] == '*') { // Skip consecutive stars.
+ }
+ const auto K = FilePath.find(Separator, J); // Index of next `Separator`.
+ const bool NoMoreSeparatorsInFilePath = K == StringRef::npos;
+ if (I == EOP) // `Pattern` ends with a star.
+ return NoMoreSeparatorsInFilePath;
+ // `Pattern` ends with a lone backslash.
+ if (Pattern[I] == '\\' && ++I == EOP)
+ return false;
+ // The star is followed by a (possibly escaped) `Separator`.
+ if (Pattern[I] == Separator) {
+ if (NoMoreSeparatorsInFilePath)
+ return false;
+ J = K; // Skip to next `Separator` in `FilePath`.
+ break;
+ }
+ // Recurse.
+ for (auto Pat = Pattern.substr(I); J < End && FilePath[J] != Separator;
+ ++J) {
+ if (matchFilePath(Pat, FilePath.substr(J)))
+ return true;
+ }
+ return false;
+ }
+ case '[':
+ // Skip e.g. `[!]`.
+ if (I + 3 < EOP || (I + 3 == EOP && Pattern[I + 1] != '!')) {
+ // Skip unpaired `[`, brackets containing slashes, and `[]`.
+ if (const auto K = Pattern.find_first_of("]/", I + 1);
+ K != StringRef::npos && Pattern[K] == ']' && K > I + 1) {
+ if (F == Separator)
+ return false;
+ ++I; // After the `[`.
+ bool Negated = false;
+ if (Pattern[I] == '!') {
+ Negated = true;
+ ++I; // After the `!`.
+ }
+ bool Match = false;
+ do {
+ if (I + 2 < K && Pattern[I + 1] == '-') {
+ Match = Pattern[I] <= F && F <= Pattern[I + 2];
+ I += 3; // After the range, e.g. `A-Z`.
+ } else {
+ Match = F == Pattern[I++];
+ }
+ } while (!Match && I < K);
+ if (Negated ? Match : !Match)
+ return false;
+ I = K + 1; // After the `]`.
+ continue;
+ }
+ }
+ [[fallthrough]]; // Match `[` literally.
+ default:
+ if (F != Pattern[I])
+ return false;
+ }
+
+ ++I;
+ }
+
+ // Match trailing stars with null strings.
+ while (I < EOP && Pattern[I] == '*')
+ ++I;
+
+ return I == EOP;
+}
+
+} // namespace format
+} // namespace clang
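
A few concrete cases, following the rules the implementation above encodes (in particular, `*` and `?` never match the `/` separator); this assumes `<cassert>` and MatchFilePath.h are included:

    using clang::format::matchFilePath;
    assert(matchFilePath("*.cpp", "foo.cpp"));        // '*' matches within a component
    assert(!matchFilePath("*.cpp", "dir/foo.cpp"));   // ...but never across '/'
    assert(matchFilePath("dir/*.cpp", "dir/foo.cpp"));
    assert(matchFilePath("foo.?pp", "foo.cpp"));      // '?' matches any single non-'/' char
    assert(matchFilePath("[a-c]oo.cpp", "boo.cpp"));  // bracket expression with a range
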
diff --git a/clang/lib/Format/MatchFilePath.h b/clang/lib/Format/MatchFilePath.h
new file mode 100644
index 000000000000..482dab7c748e
--- /dev/null
+++ b/clang/lib/Format/MatchFilePath.h
@@ -0,0 +1,22 @@
+//===--- MatchFilePath.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_FORMAT_MATCHFILEPATH_H
+#define LLVM_CLANG_LIB_FORMAT_MATCHFILEPATH_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace clang {
+namespace format {
+
+bool matchFilePath(llvm::StringRef Pattern, llvm::StringRef FilePath);
+
+} // end namespace format
+} // end namespace clang
+
+#endif
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index f3551af34243..3ac3aa3c5e3a 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -3403,7 +3403,8 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
continue;
}
if (Tok->is(tok::kw_const) || Tok->isSimpleTypeSpecifier() ||
- Tok->isOneOf(TT_PointerOrReference, TT_StartOfName, tok::ellipsis)) {
+ Tok->isOneOf(TT_PointerOrReference, TT_StartOfName, tok::ellipsis,
+ TT_TypeName)) {
return true;
}
if (Tok->isOneOf(tok::l_brace, TT_ObjCMethodExpr) || Tok->Tok.isLiteral())
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index c38b4c884070..684609747a55 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1650,8 +1650,10 @@ void UnwrappedLineParser::parseStructuralElement(
return;
}
// In Verilog labels can be any expression, so we don't do them here.
- if (!Style.isVerilog() && Tokens->peekNextToken()->is(tok::colon) &&
- !Line->MustBeDeclaration) {
+ // JS doesn't have macros, and within classes colons indicate fields, not
+ // labels.
+ if (!Style.isJavaScript() && !Style.isVerilog() &&
+ Tokens->peekNextToken()->is(tok::colon) && !Line->MustBeDeclaration) {
nextToken();
Line->Tokens.begin()->Tok->MustBreakBefore = true;
FormatTok->setFinalizedType(TT_GotoLabelColon);
diff --git a/clang/lib/Frontend/DependencyGraph.cpp b/clang/lib/Frontend/DependencyGraph.cpp
index e96669f856bb..b471471f3528 100644
--- a/clang/lib/Frontend/DependencyGraph.cpp
+++ b/clang/lib/Frontend/DependencyGraph.cpp
@@ -110,8 +110,7 @@ void DependencyGraphCallback::OutputGraphFile() {
writeNodeReference(OS, AllFiles[I]);
OS << " [ shape=\"box\", label=\"";
StringRef FileName = AllFiles[I].getName();
- if (FileName.starts_with(SysRoot))
- FileName = FileName.substr(SysRoot.size());
+ FileName.consume_front(SysRoot);
OS << DOT::EscapeString(std::string(FileName)) << "\"];\n";
}
diff --git a/clang/lib/Frontend/LayoutOverrideSource.cpp b/clang/lib/Frontend/LayoutOverrideSource.cpp
index f474d4fe8fdc..a1866ec09c9d 100644
--- a/clang/lib/Frontend/LayoutOverrideSource.cpp
+++ b/clang/lib/Frontend/LayoutOverrideSource.cpp
@@ -147,8 +147,7 @@ LayoutOverrideSource::LayoutOverrideSource(StringRef Filename) {
// Skip over this offset, the following comma, and any spaces.
LineStr = LineStr.substr(1);
- while (!LineStr.empty() && isWhitespace(LineStr[0]))
- LineStr = LineStr.substr(1);
+ LineStr = LineStr.drop_while(isWhitespace);
}
}
@@ -163,8 +162,7 @@ LayoutOverrideSource::LayoutOverrideSource(StringRef Filename) {
// Skip over this offset, the following comma, and any spaces.
LineStr = LineStr.substr(1);
- while (!LineStr.empty() && isWhitespace(LineStr[0]))
- LineStr = LineStr.substr(1);
+ LineStr = LineStr.drop_while(isWhitespace);
}
continue;
}
@@ -180,8 +178,7 @@ LayoutOverrideSource::LayoutOverrideSource(StringRef Filename) {
// Skip over this offset, the following comma, and any spaces.
LineStr = LineStr.substr(1);
- while (!LineStr.empty() && isWhitespace(LineStr[0]))
- LineStr = LineStr.substr(1);
+ LineStr = LineStr.drop_while(isWhitespace);
}
}
}
diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
index 09c1460d54e1..8a3d2286cd16 100644
--- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
+++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
@@ -1144,8 +1144,7 @@ std::unique_ptr<Directive> Directive::create(bool RegexKind,
std::string RegexStr;
StringRef S = Text;
while (!S.empty()) {
- if (S.starts_with("{{")) {
- S = S.drop_front(2);
+ if (S.consume_front("{{")) {
size_t RegexMatchLength = S.find("}}");
assert(RegexMatchLength != StringRef::npos);
// Append the regex, enclosed in parentheses.
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index f8fdd402777e..735e4e4e3be8 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -139,6 +139,7 @@ set(webassembly_files
set(x86_files
# Intrinsics
+ adcintrin.h
adxintrin.h
ammintrin.h
amxcomplexintrin.h
diff --git a/clang/lib/Headers/adcintrin.h b/clang/lib/Headers/adcintrin.h
new file mode 100644
index 000000000000..0065a1b543f8
--- /dev/null
+++ b/clang/lib/Headers/adcintrin.h
@@ -0,0 +1,160 @@
+/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ADCINTRIN_H
+#define __ADCINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Use C++ inline semantics in C++, GNU inline for C mode. */
+#if defined(__cplusplus)
+#define __INLINE __inline
+#else
+#define __INLINE static __inline
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
+/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
+/// at \a __p, and returns the 8-bit carry-out (carry flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store32(__p, __x + __y + temp)
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+/// A 32-bit unsigned addend.
+/// \param __y
+/// A 32-bit unsigned addend.
+/// \param __p
+/// Pointer to memory for storing the sum.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
+ unsigned int __x,
+ unsigned int __y,
+ unsigned int *__p) {
+ return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
+/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
+/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
+/// and returns the 8-bit carry-out (carry or overflow flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store32(__p, __x - (__y + temp))
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c SBB instruction.
+///
+/// \param __cf
+/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+/// The 32-bit unsigned minuend.
+/// \param __y
+/// The 32-bit unsigned subtrahend.
+/// \param __p
+/// Pointer to memory for storing the difference.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
+ unsigned int __x,
+ unsigned int __y,
+ unsigned int *__p) {
+ return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
+/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
+/// at \a __p, and returns the 8-bit carry-out (carry flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store64(__p, __x + __y + temp)
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+/// A 64-bit unsigned addend.
+/// \param __y
+/// A 64-bit unsigned addend.
+/// \param __p
+/// Pointer to memory for storing the sum.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+ unsigned long long __y, unsigned long long *__p) {
+ return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+
+/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
+/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
+/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
+/// and returns the 8-bit carry-out (carry or overflow flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store64(__p, __x - (__y + temp))
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c SBB instruction.
+///
+/// \param __cf
+/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+/// The 64-bit unsigned minuend.
+/// \param __y
+/// The 64-bit unsigned subtrahend.
+/// \param __p
+/// Pointer to memory for storing the difference.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+ unsigned long long __y, unsigned long long *__p) {
+ return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#undef __INLINE
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADCINTRIN_H */
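A minimal usage sketch for the intrinsics that adcintrin.h now hosts, assuming an x86 target where <immintrin.h> is available; add64_via_limbs is an illustrative helper, not part of the patch:

// Add two unsigned 64-bit values represented as 32-bit limbs by chaining the
// carry through _addcarry_u32, as described in the doc comments above.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static uint64_t add64_via_limbs(uint64_t a, uint64_t b) {
  unsigned int lo, hi;
  unsigned char carry =
      _addcarry_u32(0, (unsigned int)a, (unsigned int)b, &lo);
  carry = _addcarry_u32(carry, (unsigned int)(a >> 32),
                        (unsigned int)(b >> 32), &hi);
  (void)carry; // the final carry-out would be the 65th bit of the sum
  return ((uint64_t)hi << 32) | lo;
}

int main() {
  // 0xFFFFFFFF + 1 overflows the low limb, so the carry propagates upward.
  std::printf("%llu\n", (unsigned long long)add64_via_limbs(0xFFFFFFFFu, 1));
  return 0;
}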
diff --git a/clang/lib/Headers/adxintrin.h b/clang/lib/Headers/adxintrin.h
index 20f6211e567b..bc6a4caf3533 100644
--- a/clang/lib/Headers/adxintrin.h
+++ b/clang/lib/Headers/adxintrin.h
@@ -15,7 +15,8 @@
#define __ADXINTRIN_H
/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("adx")))
/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
@@ -53,10 +54,10 @@ extern "C" {
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char
- __attribute__((__always_inline__, __nodebug__, __target__("adx")))
- _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
- unsigned int *__p) {
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
+ unsigned int __x,
+ unsigned int __y,
+ unsigned int *__p) {
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
@@ -84,137 +85,10 @@ __INLINE unsigned char
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char
- __attribute__((__always_inline__, __nodebug__, __target__("adx")))
- _addcarryx_u64(unsigned char __cf, unsigned long long __x,
- unsigned long long __y, unsigned long long *__p) {
- return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-/* Intrinsics that are also available if __ADX__ is undefined. */
-
-/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
-/// at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-/// A 32-bit unsigned addend.
-/// \param __y
-/// A 32-bit unsigned addend.
-/// \param __p
-/// Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
- unsigned int __x,
- unsigned int __y,
- unsigned int *__p) {
- return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
-/// at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-/// A 64-bit unsigned addend.
-/// \param __y
-/// A 64-bit unsigned addend.
-/// \param __p
-/// Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u64(unsigned char __cf, unsigned long long __x,
- unsigned long long __y, unsigned long long *__p) {
- return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
-/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
-/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
-/// and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SBB instruction.
-///
-/// \param __cf
-/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-/// The 32-bit unsigned minuend.
-/// \param __y
-/// The 32-bit unsigned subtrahend.
-/// \param __p
-/// Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
- unsigned int __x,
- unsigned int __y,
- unsigned int *__p) {
- return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
-/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
-/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
-/// and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-/// The 64-bit unsigned minuend.
-/// \param __y
-/// The 64-bit unsigned subtrahend.
-/// \param __p
-/// Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u64(unsigned char __cf, unsigned long long __x,
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
- return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+ return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif
@@ -222,6 +96,7 @@ _subborrow_u64(unsigned char __cf, unsigned long long __x,
}
#endif
+#undef __INLINE
#undef __DEFAULT_FN_ATTRS
#endif /* __ADXINTRIN_H */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 9bfe2fcdabdb..0149a1cdea63 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -580,9 +580,13 @@ _storebe_i64(void * __P, long long __D) {
#include <cetintrin.h>
#endif
-/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
- * whereas others are also available at all times. */
+/* Intrinsics inside adcintrin.h are available at all times. */
+#include <adcintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__ADX__)
#include <adxintrin.h>
+#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RDSEED__)
diff --git a/clang/lib/Headers/riscv_bitmanip.h b/clang/lib/Headers/riscv_bitmanip.h
index 1a81cc8618c9..2bc7ee022a96 100644
--- a/clang/lib/Headers/riscv_bitmanip.h
+++ b/clang/lib/Headers/riscv_bitmanip.h
@@ -34,7 +34,7 @@ __riscv_ctz_32(uint32_t __x) {
static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
__riscv_cpop_32(uint32_t __x) {
- return __builtin_riscv_cpop_32(__x);
+ return __builtin_popcount(__x);
}
#if __riscv_xlen == 64
@@ -55,7 +55,7 @@ __riscv_ctz_64(uint64_t __x) {
static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
__riscv_cpop_64(uint64_t __x) {
- return __builtin_riscv_cpop_64(__x);
+ return __builtin_popcountll(__x);
}
#endif
#endif // defined(__riscv_zbb)
@@ -120,7 +120,23 @@ __riscv_zip_32(uint32_t __x) {
#endif
#endif // defined(__riscv_zbkb)
-#if defined(__riscv_zbkc)
+#if defined(__riscv_zbc)
+#if __riscv_xlen == 32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmulr_32(uint32_t __x, uint32_t __y) {
+ return __builtin_riscv_clmulr_32(__x, __y);
+}
+#endif
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmulr_64(uint64_t __x, uint64_t __y) {
+ return __builtin_riscv_clmulr_64(__x, __y);
+}
+#endif
+#endif // defined(__riscv_zbc)
+
+#if defined(__riscv_zbkc) || defined(__riscv_zbc)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__riscv_clmul_32(uint32_t __x, uint32_t __y) {
return __builtin_riscv_clmul_32(__x, __y);
@@ -144,7 +160,7 @@ __riscv_clmulh_64(uint64_t __x, uint64_t __y) {
return __builtin_riscv_clmulh_64(__x, __y);
}
#endif
-#endif // defined(__riscv_zbkc)
+#endif // defined(__riscv_zbkc) || defined(__riscv_zbc)
#if defined(__riscv_zbkx)
#if __riscv_xlen == 32
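A small sketch of how the relaxed Zbc/Zbkc guard above is meant to be consumed, assuming a RISC-V target; clmul_lo_32 is an illustrative wrapper around the __riscv_clmul_32 helper shown in the hunk:

// Carry-less (XOR-accumulating) multiply of two 32-bit values, guarded the
// same way as the updated header: either Zbc or Zbkc makes it available.
#include <riscv_bitmanip.h>
#include <cstdint>

uint32_t clmul_lo_32(uint32_t x, uint32_t y) {
#if defined(__riscv_zbc) || defined(__riscv_zbkc)
  // Low 32 bits of the carry-less product of x and y.
  return __riscv_clmul_32(x, y);
#else
  (void)x;
  (void)y;
  return 0; // neither extension is available on this target
#endif
}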
diff --git a/clang/lib/Headers/usermsrintrin.h b/clang/lib/Headers/usermsrintrin.h
index 6d1424ad3b2e..61388376706d 100644
--- a/clang/lib/Headers/usermsrintrin.h
+++ b/clang/lib/Headers/usermsrintrin.h
@@ -14,12 +14,33 @@
#define __USERMSRINTRIN_H
#ifdef __x86_64__
+/// Reads the contents of the 64-bit MSR specified in \a __A and returns it.
+///
+/// This intrinsic corresponds to the <c> URDMSR </c> instruction.
+/// \param __A
+///    An unsigned long long integer specifying the address of the MSR to read.
+///
+/// \code{.operation}
+/// DEST := MSR[__A]
+/// \endcode
static __inline__ unsigned long long
__attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
_urdmsr(unsigned long long __A) {
return __builtin_ia32_urdmsr(__A);
}
+/// Writes the contents of \a __B into the 64-bit MSR specified in \a __A.
+///
+/// This intrinsic corresponds to the <c> UWRMSR </c> instruction.
+///
+/// \param __A
+///    An unsigned long long integer specifying the address of the MSR to write.
+/// \param __B
+///    An unsigned long long integer containing the value to write to the MSR.
+///
+/// \code{.operation}
+/// MSR[__A] := __B
+/// \endcode
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
_uwrmsr(unsigned long long __A, unsigned long long __B) {
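A usage sketch for the newly documented user-mode MSR intrinsics, assuming an x86-64 target with the 'usermsr' feature enabled and that <x86gprintrin.h> is the umbrella header that pulls in usermsrintrin.h; kExampleMsr and bump_msr are hypothetical:

// Read an MSR with _urdmsr, then write an updated value back with _uwrmsr.
#include <x86gprintrin.h>

static constexpr unsigned long long kExampleMsr = 0x1b01ULL; // hypothetical address

unsigned long long bump_msr(unsigned long long delta) {
  unsigned long long value = _urdmsr(kExampleMsr); // value := MSR[kExampleMsr]
  _uwrmsr(kExampleMsr, value + delta);             // MSR[kExampleMsr] := value + delta
  return value;
}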
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 2a69325f0295..da0570b7b0f1 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1219,8 +1219,8 @@ void Sema::checkFortifiedBuiltinMemoryFunction(FunctionDecl *FD,
if (IsChkVariant) {
FunctionName = FunctionName.drop_front(std::strlen("__builtin___"));
FunctionName = FunctionName.drop_back(std::strlen("_chk"));
- } else if (FunctionName.starts_with("__builtin_")) {
- FunctionName = FunctionName.drop_front(std::strlen("__builtin_"));
+ } else {
+ FunctionName.consume_front("__builtin_");
}
return FunctionName;
};
@@ -5388,15 +5388,17 @@ bool Sema::CheckRISCVBuiltinFunctionCall(const TargetInfo &TI,
QualType Op1Type = TheCall->getArg(0)->getType();
QualType Op2Type = TheCall->getArg(1)->getType();
QualType Op3Type = TheCall->getArg(2)->getType();
- uint64_t ElemSize = Op1Type->isRVVType(32, false) ? 32 : 64;
+ ASTContext::BuiltinVectorTypeInfo Info =
+ Context.getBuiltinVectorTypeInfo(Op1Type->castAs<BuiltinType>());
+ uint64_t ElemSize = Context.getTypeSize(Info.ElementType);
if (ElemSize == 64 && !TI.hasFeature("zvknhb"))
return Diag(TheCall->getBeginLoc(),
- diag::err_riscv_type_requires_extension)
- << Op1Type << "zvknhb";
+ diag::err_riscv_builtin_requires_extension)
+             << /* IsExtension */ true << TheCall->getSourceRange() << "zvknhb";
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, ElemSize << 2) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, ElemSize << 2) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op3Type, ElemSize << 2);
+ return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, ElemSize * 4) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, ElemSize * 4) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, *this, Op3Type, ElemSize * 4);
}
case RISCVVector::BI__builtin_rvv_sf_vc_i_se_u8mf8:
@@ -6170,30 +6172,33 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
void Sema::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D) {
const TargetInfo &TI = Context.getTargetInfo();
+
+ ASTContext::BuiltinVectorTypeInfo Info =
+ Context.getBuiltinVectorTypeInfo(Ty->castAs<BuiltinType>());
+ unsigned EltSize = Context.getTypeSize(Info.ElementType);
+ unsigned MinElts = Info.EC.getKnownMinValue();
+
// (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), (64, m1) requires at
// least zve64x
- if ((Ty->isRVVType(/* Bitwidth */ 64, /* IsFloat */ false) ||
- Ty->isRVVType(/* ElementCount */ 1)) &&
+ if (((EltSize == 64 && Info.ElementType->isIntegerType()) || MinElts == 1) &&
!TI.hasFeature("zve64x"))
Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x";
- if (Ty->isRVVType(/* Bitwidth */ 16, /* IsFloat */ true) &&
- !TI.hasFeature("zvfh") && !TI.hasFeature("zvfhmin"))
+ else if (Info.ElementType->isFloat16Type() && !TI.hasFeature("zvfh") &&
+ !TI.hasFeature("zvfhmin"))
Diag(Loc, diag::err_riscv_type_requires_extension, D)
<< Ty << "zvfh or zvfhmin";
- // Check if enabled zvfbfmin for BFloat16
- if (Ty->isRVVType(/* Bitwidth */ 16, /* IsFloat */ false,
- /* IsBFloat */ true) &&
- !TI.hasFeature("experimental-zvfbfmin"))
+ else if (Info.ElementType->isBFloat16Type() &&
+ !TI.hasFeature("experimental-zvfbfmin"))
Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin";
- if (Ty->isRVVType(/* Bitwidth */ 32, /* IsFloat */ true) &&
- !TI.hasFeature("zve32f"))
+ else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) &&
+ !TI.hasFeature("zve32f"))
Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32f";
- if (Ty->isRVVType(/* Bitwidth */ 64, /* IsFloat */ true) &&
- !TI.hasFeature("zve64d"))
+ else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) &&
+ !TI.hasFeature("zve64d"))
Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d";
// Given that caller already checked isRVVType() before calling this function,
// if we don't have at least zve32x supported, then we need to emit error.
- if (!TI.hasFeature("zve32x"))
+ else if (!TI.hasFeature("zve32x"))
Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x";
}
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index af8b90ecfed9..4a385a396fa6 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5825,8 +5825,7 @@ struct IntrinToName {
static bool ArmBuiltinAliasValid(unsigned BuiltinID, StringRef AliasName,
ArrayRef<IntrinToName> Map,
const char *IntrinNames) {
- if (AliasName.starts_with("__arm_"))
- AliasName = AliasName.substr(6);
+ AliasName.consume_front("__arm_");
const IntrinToName *It =
llvm::lower_bound(Map, BuiltinID, [](const IntrinToName &L, unsigned Id) {
return L.Id < Id;
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 0fbd87ce34db..cc9db5ded114 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -25,6 +25,7 @@
#include "clang/Sema/EnterExpressionEvaluationContext.h"
#include "clang/Sema/Initialization.h"
#include "clang/Sema/Lookup.h"
+#include "clang/Sema/Ownership.h"
#include "clang/Sema/SemaInternal.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FoldingSet.h"
@@ -5429,18 +5430,12 @@ static void TryOrBuildParenListInitialization(
auto HandleInitializedEntity = [&](const InitializedEntity &SubEntity,
const InitializationKind &SubKind,
Expr *Arg, Expr **InitExpr = nullptr) {
- InitializationSequence IS = [&]() {
- if (Arg)
- return InitializationSequence(S, SubEntity, SubKind, Arg);
- return InitializationSequence(S, SubEntity, SubKind, std::nullopt);
- }();
+ InitializationSequence IS = InitializationSequence(
+ S, SubEntity, SubKind, Arg ? MultiExprArg(Arg) : std::nullopt);
if (IS.Failed()) {
if (!VerifyOnly) {
- if (Arg)
- IS.Diagnose(S, SubEntity, SubKind, Arg);
- else
- IS.Diagnose(S, SubEntity, SubKind, std::nullopt);
+ IS.Diagnose(S, SubEntity, SubKind, Arg ? ArrayRef(Arg) : std::nullopt);
} else {
Sequence.SetFailed(
InitializationSequence::FK_ParenthesizedListInitFailed);
@@ -5450,10 +5445,8 @@ static void TryOrBuildParenListInitialization(
}
if (!VerifyOnly) {
ExprResult ER;
- if (Arg)
- ER = IS.Perform(S, SubEntity, SubKind, Arg);
- else
- ER = IS.Perform(S, SubEntity, SubKind, std::nullopt);
+ ER = IS.Perform(S, SubEntity, SubKind,
+ Arg ? MultiExprArg(Arg) : std::nullopt);
if (InitExpr)
*InitExpr = ER.get();
else
@@ -5519,6 +5512,14 @@ static void TryOrBuildParenListInitialization(
} else if (auto *RT = Entity.getType()->getAs<RecordType>()) {
bool IsUnion = RT->isUnionType();
const CXXRecordDecl *RD = cast<CXXRecordDecl>(RT->getDecl());
+ if (RD->isInvalidDecl()) {
+ // Exit early to avoid confusion when processing members.
+ // We do the same for braced list initialization in
+ // `CheckStructUnionTypes`.
+ Sequence.SetFailed(
+ clang::InitializationSequence::FK_ParenthesizedListInitFailed);
+ return;
+ }
if (!IsUnion) {
for (const CXXBaseSpecifier &Base : RD->bases()) {
@@ -10410,40 +10411,53 @@ static void DiagnoseNarrowingInInitList(Sema &S,
// No narrowing occurred.
return;
- case NK_Type_Narrowing:
+ case NK_Type_Narrowing: {
// This was a floating-to-integer conversion, which is always considered a
// narrowing conversion even if the value is a constant and can be
// represented exactly as an integer.
- S.Diag(PostInit->getBeginLoc(), NarrowingErrs(S.getLangOpts())
- ? diag::ext_init_list_type_narrowing
- : diag::warn_init_list_type_narrowing)
+ QualType T = EntityType.getNonReferenceType();
+ S.Diag(PostInit->getBeginLoc(),
+ NarrowingErrs(S.getLangOpts())
+ ? (T == EntityType
+ ? diag::ext_init_list_type_narrowing
+ : diag::ext_init_list_type_narrowing_const_reference)
+ : diag::warn_init_list_type_narrowing)
<< PostInit->getSourceRange()
<< PreNarrowingType.getLocalUnqualifiedType()
- << EntityType.getNonReferenceType().getLocalUnqualifiedType();
+ << T.getLocalUnqualifiedType();
break;
+ }
- case NK_Constant_Narrowing:
+ case NK_Constant_Narrowing: {
// A constant value was narrowed.
+ QualType T = EntityType.getNonReferenceType();
S.Diag(PostInit->getBeginLoc(),
NarrowingErrs(S.getLangOpts())
- ? diag::ext_init_list_constant_narrowing
+ ? (T == EntityType
+ ? diag::ext_init_list_constant_narrowing
+ : diag::ext_init_list_constant_narrowing_const_reference)
: diag::warn_init_list_constant_narrowing)
<< PostInit->getSourceRange()
<< ConstantValue.getAsString(S.getASTContext(), ConstantType)
<< EntityType.getNonReferenceType().getLocalUnqualifiedType();
break;
+ }
- case NK_Variable_Narrowing:
+ case NK_Variable_Narrowing: {
// A variable's value may have been narrowed.
+ QualType T = EntityType.getNonReferenceType();
S.Diag(PostInit->getBeginLoc(),
NarrowingErrs(S.getLangOpts())
- ? diag::ext_init_list_variable_narrowing
+ ? (T == EntityType
+ ? diag::ext_init_list_variable_narrowing
+ : diag::ext_init_list_variable_narrowing_const_reference)
: diag::warn_init_list_variable_narrowing)
<< PostInit->getSourceRange()
<< PreNarrowingType.getLocalUnqualifiedType()
<< EntityType.getNonReferenceType().getLocalUnqualifiedType();
break;
}
+ }
SmallString<128> StaticCast;
llvm::raw_svector_ostream OS(StaticCast);
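A sketch of the initializers the new ..._const_reference diagnostic variants are meant to distinguish; takes_val and takes_ref are made-up functions, and the split follows from the T != EntityType check added above (compile with -std=c++11 or later):

// Both calls narrow a double to int inside a braced list; the second binds the
// narrowed temporary to a const reference, which now gets its own diagnostic.
void takes_val(int);
void takes_ref(const int &);

void demo(double d) {
  takes_val({d}); // narrowing into a prvalue int: existing diagnostic
  takes_ref({d}); // narrowing into a temporary bound to 'const int &':
                  // reported via the ..._const_reference variant
}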
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index db0cbd5ec6d6..ed7f626971f3 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -529,7 +529,8 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc,
if (!Mod)
return true;
- if (!Mod->isInterfaceOrPartition() && !ModuleName.empty()) {
+ if (!Mod->isInterfaceOrPartition() && !ModuleName.empty() &&
+ !getLangOpts().ObjC) {
Diag(ImportLoc, diag::err_module_import_non_interface_nor_parition)
<< ModuleName;
return true;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 3826994ef212..f34d2959dc61 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -12683,7 +12683,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef<OMPClause *> Clauses,
break;
}
case OMPC_fail: {
- if (AtomicKind != OMPC_compare) {
+ if (!EncounteredAtomicKinds.contains(OMPC_compare)) {
Diag(C->getBeginLoc(), diag::err_omp_atomic_fail_no_compare)
<< SourceRange(C->getBeginLoc(), C->getEndLoc());
return StmtError();
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 63348d27a8c9..f0b03db69084 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -1271,6 +1271,9 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
bool CaseListIsErroneous = false;
+  // FIXME: We should diagnose missing or duplicate default labels even in
+  // the dependent case, because default labels themselves are never
+  // dependent.
for (SwitchCase *SC = SS->getSwitchCaseList(); SC && !HasDependentValue;
SC = SC->getNextSwitchCase()) {
@@ -1327,9 +1330,6 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
}
}
- if (!TheDefaultStmt)
- Diag(SwitchLoc, diag::warn_switch_default);
-
if (!HasDependentValue) {
// If we don't have a default statement, check whether the
// condition is constant.
@@ -1344,6 +1344,7 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
assert(!HasConstantCond ||
(ConstantCondValue.getBitWidth() == CondWidth &&
ConstantCondValue.isSigned() == CondIsSigned));
+ Diag(SwitchLoc, diag::warn_switch_default);
}
bool ShouldCheckConstantCond = HasConstantCond;
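A sketch of the situation the relocated warn_switch_default emission addresses, assuming -Wswitch-default is enabled; pick and pick_int are made-up functions:

// The dependent switch in the template is no longer diagnosed at
// template-definition time; the non-dependent switch still warns about the
// missing default label.
template <typename T> int pick(T t) {
  switch (t) { // condition is type-dependent here
  case 0:
    return 1;
  }
  return 0;
}

int pick_int(int i) {
  switch (i) { // non-dependent: still warns with -Wswitch-default
  case 0:
    return 1;
  }
  return 0;
}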
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 7140a14aefbf..547eb77930b4 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -584,7 +584,18 @@ void ASTDeclReader::Visit(Decl *D) {
void ASTDeclReader::VisitDecl(Decl *D) {
BitsUnpacker DeclBits(Record.readInt());
+ auto ModuleOwnership =
+ (Decl::ModuleOwnershipKind)DeclBits.getNextBits(/*Width=*/3);
+ D->setReferenced(DeclBits.getNextBit());
+ D->Used = DeclBits.getNextBit();
+ IsDeclMarkedUsed |= D->Used;
+ D->setAccess((AccessSpecifier)DeclBits.getNextBits(/*Width=*/2));
+ D->setImplicit(DeclBits.getNextBit());
bool HasStandaloneLexicalDC = DeclBits.getNextBit();
+ bool HasAttrs = DeclBits.getNextBit();
+ D->setTopLevelDeclInObjCContainer(DeclBits.getNextBit());
+ D->InvalidDecl = DeclBits.getNextBit();
+ D->FromASTFile = true;
if (D->isTemplateParameter() || D->isTemplateParameterPack() ||
isa<ParmVarDecl, ObjCTypeParamDecl>(D)) {
@@ -623,20 +634,6 @@ void ASTDeclReader::VisitDecl(Decl *D) {
}
D->setLocation(ThisDeclLoc);
- D->InvalidDecl = DeclBits.getNextBit();
- bool HasAttrs = DeclBits.getNextBit();
- D->setImplicit(DeclBits.getNextBit());
- D->Used = DeclBits.getNextBit();
- IsDeclMarkedUsed |= D->Used;
- D->setReferenced(DeclBits.getNextBit());
- D->setTopLevelDeclInObjCContainer(DeclBits.getNextBit());
- D->setAccess((AccessSpecifier)DeclBits.getNextBits(/*Width=*/2));
- D->FromASTFile = true;
- auto ModuleOwnership =
- (Decl::ModuleOwnershipKind)DeclBits.getNextBits(/*Width=*/3);
- bool ModulePrivate =
- (ModuleOwnership == Decl::ModuleOwnershipKind::ModulePrivate);
-
if (HasAttrs) {
AttrVec Attrs;
Record.readAttributes(Attrs);
@@ -647,8 +644,9 @@ void ASTDeclReader::VisitDecl(Decl *D) {
// Determine whether this declaration is part of a (sub)module. If so, it
// may not yet be visible.
+ bool ModulePrivate =
+ (ModuleOwnership == Decl::ModuleOwnershipKind::ModulePrivate);
if (unsigned SubmoduleID = readSubmoduleID()) {
-
switch (ModuleOwnership) {
case Decl::ModuleOwnershipKind::Visible:
ModuleOwnership = Decl::ModuleOwnershipKind::VisibleWhenImported;
@@ -1065,9 +1063,11 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) {
// after everything else is read.
BitsUnpacker FunctionDeclBits(Record.readInt());
+ FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3));
FD->setStorageClass((StorageClass)FunctionDeclBits.getNextBits(/*Width=*/3));
FD->setInlineSpecified(FunctionDeclBits.getNextBit());
FD->setImplicitlyInline(FunctionDeclBits.getNextBit());
+ FD->setHasSkippedBody(FunctionDeclBits.getNextBit());
FD->setVirtualAsWritten(FunctionDeclBits.getNextBit());
// We defer calling `FunctionDecl::setPure()` here as for methods of
// `CXXTemplateSpecializationDecl`s, we may not have connected up the
@@ -1081,16 +1081,14 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) {
FD->setDefaulted(FunctionDeclBits.getNextBit());
FD->setExplicitlyDefaulted(FunctionDeclBits.getNextBit());
FD->setIneligibleOrNotSelected(FunctionDeclBits.getNextBit());
- FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit());
FD->setConstexprKind(
(ConstexprSpecKind)FunctionDeclBits.getNextBits(/*Width=*/2));
- FD->setUsesSEHTry(FunctionDeclBits.getNextBit());
- FD->setHasSkippedBody(FunctionDeclBits.getNextBit());
+ FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit());
FD->setIsMultiVersion(FunctionDeclBits.getNextBit());
FD->setLateTemplateParsed(FunctionDeclBits.getNextBit());
FD->setFriendConstraintRefersToEnclosingTemplate(
FunctionDeclBits.getNextBit());
- FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3));
+ FD->setUsesSEHTry(FunctionDeclBits.getNextBit());
FD->EndRangeLoc = readSourceLocation();
if (FD->isExplicitlyDefaulted())
@@ -1597,6 +1595,8 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
VisitDeclaratorDecl(VD);
BitsUnpacker VarDeclBits(Record.readInt());
+ auto VarLinkage = Linkage(VarDeclBits.getNextBits(/*Width=*/3));
+ bool DefGeneratedInModule = VarDeclBits.getNextBit();
VD->VarDeclBits.SClass = (StorageClass)VarDeclBits.getNextBits(/*Width=*/3);
VD->VarDeclBits.TSCSpec = VarDeclBits.getNextBits(/*Width=*/2);
VD->VarDeclBits.InitStyle = VarDeclBits.getNextBits(/*Width=*/2);
@@ -1608,17 +1608,20 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
VD->NonParmVarDeclBits.ExceptionVar = VarDeclBits.getNextBit();
VD->NonParmVarDeclBits.NRVOVariable = VarDeclBits.getNextBit();
VD->NonParmVarDeclBits.CXXForRangeDecl = VarDeclBits.getNextBit();
- VD->NonParmVarDeclBits.ObjCForDecl = VarDeclBits.getNextBit();
+
VD->NonParmVarDeclBits.IsInline = VarDeclBits.getNextBit();
VD->NonParmVarDeclBits.IsInlineSpecified = VarDeclBits.getNextBit();
VD->NonParmVarDeclBits.IsConstexpr = VarDeclBits.getNextBit();
VD->NonParmVarDeclBits.IsInitCapture = VarDeclBits.getNextBit();
VD->NonParmVarDeclBits.PreviousDeclInSameBlockScope =
VarDeclBits.getNextBit();
- VD->NonParmVarDeclBits.ImplicitParamKind =
- VarDeclBits.getNextBits(/*Width*/ 3);
+
VD->NonParmVarDeclBits.EscapingByref = VarDeclBits.getNextBit();
HasDeducedType = VarDeclBits.getNextBit();
+ VD->NonParmVarDeclBits.ImplicitParamKind =
+ VarDeclBits.getNextBits(/*Width*/ 3);
+
+ VD->NonParmVarDeclBits.ObjCForDecl = VarDeclBits.getNextBit();
}
// If this variable has a deduced type, defer reading that type until we are
@@ -1630,7 +1633,6 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
VD->setType(Reader.GetType(DeferredTypeID));
DeferredTypeID = 0;
- auto VarLinkage = Linkage(VarDeclBits.getNextBits(/*Width=*/3));
VD->setCachedLinkage(VarLinkage);
// Reconstruct the one piece of the IdentifierNamespace that we need.
@@ -1638,7 +1640,7 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
VD->getLexicalDeclContext()->isFunctionOrMethod())
VD->setLocalExternDecl();
- if (VarDeclBits.getNextBit()) {
+ if (DefGeneratedInModule) {
Reader.DefinitionSource[VD] =
Loc.F->Kind == ModuleKind::MK_MainFile ||
Reader.getContext().getLangOpts().BuildingPCHWithObjectFile;
@@ -1704,10 +1706,10 @@ void ASTDeclReader::VisitImplicitParamDecl(ImplicitParamDecl *PD) {
void ASTDeclReader::VisitParmVarDecl(ParmVarDecl *PD) {
VisitVarDecl(PD);
+ unsigned scopeIndex = Record.readInt();
BitsUnpacker ParmVarDeclBits(Record.readInt());
unsigned isObjCMethodParam = ParmVarDeclBits.getNextBit();
unsigned scopeDepth = ParmVarDeclBits.getNextBits(/*Width=*/7);
- unsigned scopeIndex = ParmVarDeclBits.getNextBits(/*Width=*/8);
unsigned declQualifier = ParmVarDeclBits.getNextBits(/*Width=*/7);
if (isObjCMethodParam) {
assert(scopeDepth == 0);
@@ -2660,7 +2662,7 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
D->setDeclaredWithTypename(Record.readInt());
- if (Record.readBool()) {
+ if (D->hasTypeConstraint()) {
ConceptReference *CR = nullptr;
if (Record.readBool())
CR = Record.readConceptReference();
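The reader changes above depend on consuming packed bits in exactly the order the writer emitted them, which is why the getNextBit/getNextBits calls are reordered rather than merely moved. A simplified, self-contained sketch of that unpacking pattern; Unpacker is a stand-in for clang's BitsUnpacker, not the real class, and the field layout in main is illustrative:

// Fields are packed low-to-high into a single word; the reader must pull them
// back out with the same widths and in the same order.
#include <cassert>
#include <cstdint>

class Unpacker {
  uint64_t Value;
  unsigned Pos = 0;

public:
  explicit Unpacker(uint64_t V) : Value(V) {}

  bool getNextBit() { return getNextBits(1); }

  uint64_t getNextBits(unsigned Width) {
    assert(Pos + Width <= 64 && "ran past the packed word");
    uint64_t Bits = (Value >> Pos) & ((uint64_t(1) << Width) - 1);
    Pos += Width;
    return Bits;
  }

  void advance(unsigned Width) { Pos += Width; } // skip fields we don't need
};

int main() {
  // Writer side (conceptually): Ownership (3 bits), Referenced (1 bit),
  // Used (1 bit), Access (2 bits), packed low-to-high in that order.
  uint64_t Packed = (0b10ull << 5) | (1ull << 4) | (0ull << 3) | 0b101ull;
  Unpacker Bits(Packed);
  unsigned Ownership = Bits.getNextBits(3); // 0b101
  bool Referenced = Bits.getNextBit();      // false
  bool Used = Bits.getNextBit();            // true
  unsigned Access = Bits.getNextBits(2);    // 0b10
  return (Ownership == 0b101 && !Referenced && Used && Access == 0b10) ? 0 : 1;
}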
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index b3a6f619372b..21aed570ba26 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -73,6 +73,8 @@ namespace clang {
ASTRecordReader &Record;
llvm::BitstreamCursor &DeclsCursor;
+ std::optional<BitsUnpacker> CurrentUnpackingBits;
+
SourceLocation readSourceLocation() {
return Record.readSourceLocation();
}
@@ -110,6 +112,9 @@ namespace clang {
/// itself.
static const unsigned NumExprFields = NumStmtFields + 2;
+ /// The number of bits required for the packing bits for the Expr class.
+ static const unsigned NumExprBits = 10;
+
/// Read and initialize a ExplicitTemplateArgumentList structure.
void ReadTemplateKWAndArgsInfo(ASTTemplateKWAndArgsInfo &Args,
TemplateArgumentLoc *ArgsLocArray,
@@ -214,9 +219,11 @@ void ASTStmtReader::VisitAttributedStmt(AttributedStmt *S) {
void ASTStmtReader::VisitIfStmt(IfStmt *S) {
VisitStmt(S);
- bool HasElse = Record.readInt();
- bool HasVar = Record.readInt();
- bool HasInit = Record.readInt();
+ CurrentUnpackingBits.emplace(Record.readInt());
+
+ bool HasElse = CurrentUnpackingBits->getNextBit();
+ bool HasVar = CurrentUnpackingBits->getNextBit();
+ bool HasInit = CurrentUnpackingBits->getNextBit();
S->setStatementKind(static_cast<IfStatementKind>(Record.readInt()));
S->setCond(Record.readSubExpr());
@@ -523,14 +530,15 @@ void ASTStmtReader::VisitCapturedStmt(CapturedStmt *S) {
void ASTStmtReader::VisitExpr(Expr *E) {
VisitStmt(E);
+ CurrentUnpackingBits.emplace(Record.readInt());
+ E->setDependence(static_cast<ExprDependence>(
+ CurrentUnpackingBits->getNextBits(/*Width=*/5)));
+ E->setValueKind(static_cast<ExprValueKind>(
+ CurrentUnpackingBits->getNextBits(/*Width=*/2)));
+ E->setObjectKind(static_cast<ExprObjectKind>(
+ CurrentUnpackingBits->getNextBits(/*Width=*/3)));
+
E->setType(Record.readType());
- BitsUnpacker ExprBits(Record.readInt());
- E->setDependence(
- static_cast<ExprDependence>(ExprBits.getNextBits(/*Width=*/5)));
- E->setValueKind(
- static_cast<ExprValueKind>(ExprBits.getNextBits(/*Width=*/2)));
- E->setObjectKind(
- static_cast<ExprObjectKind>(ExprBits.getNextBits(/*Width=*/3)));
assert(Record.getIdx() == NumExprFields &&
"Incorrect expression field count");
}
@@ -591,13 +599,17 @@ void ASTStmtReader::VisitPredefinedExpr(PredefinedExpr *E) {
void ASTStmtReader::VisitDeclRefExpr(DeclRefExpr *E) {
VisitExpr(E);
- E->DeclRefExprBits.HasQualifier = Record.readInt();
- E->DeclRefExprBits.HasFoundDecl = Record.readInt();
- E->DeclRefExprBits.HasTemplateKWAndArgsInfo = Record.readInt();
- E->DeclRefExprBits.HadMultipleCandidates = Record.readInt();
- E->DeclRefExprBits.RefersToEnclosingVariableOrCapture = Record.readInt();
- E->DeclRefExprBits.NonOdrUseReason = Record.readInt();
- E->DeclRefExprBits.IsImmediateEscalating = Record.readInt();
+ CurrentUnpackingBits.emplace(Record.readInt());
+ E->DeclRefExprBits.HadMultipleCandidates = CurrentUnpackingBits->getNextBit();
+ E->DeclRefExprBits.RefersToEnclosingVariableOrCapture =
+ CurrentUnpackingBits->getNextBit();
+ E->DeclRefExprBits.NonOdrUseReason =
+ CurrentUnpackingBits->getNextBits(/*Width=*/2);
+ E->DeclRefExprBits.IsImmediateEscalating = CurrentUnpackingBits->getNextBit();
+ E->DeclRefExprBits.HasFoundDecl = CurrentUnpackingBits->getNextBit();
+ E->DeclRefExprBits.HasQualifier = CurrentUnpackingBits->getNextBit();
+ E->DeclRefExprBits.HasTemplateKWAndArgsInfo =
+ CurrentUnpackingBits->getNextBit();
E->DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter = false;
unsigned NumTemplateArgs = 0;
if (E->hasTemplateKWAndArgsInfo())
@@ -706,12 +718,13 @@ void ASTStmtReader::VisitParenListExpr(ParenListExpr *E) {
void ASTStmtReader::VisitUnaryOperator(UnaryOperator *E) {
VisitExpr(E);
- bool hasFP_Features = Record.readInt();
+ bool hasFP_Features = CurrentUnpackingBits->getNextBit();
assert(hasFP_Features == E->hasStoredFPFeatures());
E->setSubExpr(Record.readSubExpr());
- E->setOpcode((UnaryOperator::Opcode)Record.readInt());
+ E->setOpcode(
+ (UnaryOperator::Opcode)CurrentUnpackingBits->getNextBits(/*Width=*/5));
E->setOperatorLoc(readSourceLocation());
- E->setCanOverflow(Record.readInt());
+ E->setCanOverflow(CurrentUnpackingBits->getNextBit());
if (hasFP_Features)
E->setStoredFPFeatures(
FPOptionsOverride::getFromOpaqueInt(Record.readInt()));
@@ -1000,12 +1013,11 @@ void ASTStmtReader::VisitOMPIteratorExpr(OMPIteratorExpr *E) {
void ASTStmtReader::VisitCallExpr(CallExpr *E) {
VisitExpr(E);
- BitsUnpacker CallExprBits = Record.readInt();
-
- unsigned NumArgs = CallExprBits.getNextBits(/*Width=*/16);
- bool HasFPFeatures = CallExprBits.getNextBit();
+ unsigned NumArgs = Record.readInt();
+ CurrentUnpackingBits.emplace(Record.readInt());
E->setADLCallKind(
- static_cast<CallExpr::ADLCallKind>(CallExprBits.getNextBit()));
+ static_cast<CallExpr::ADLCallKind>(CurrentUnpackingBits->getNextBit()));
+ bool HasFPFeatures = CurrentUnpackingBits->getNextBit();
assert((NumArgs == E->getNumArgs()) && "Wrong NumArgs!");
E->setRParenLoc(readSourceLocation());
E->setCallee(Record.readSubExpr());
@@ -1024,27 +1036,29 @@ void ASTStmtReader::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) {
void ASTStmtReader::VisitMemberExpr(MemberExpr *E) {
VisitExpr(E);
- bool HasQualifier = Record.readInt();
- bool HasFoundDecl = Record.readInt();
- bool HasTemplateInfo = Record.readInt();
+ CurrentUnpackingBits.emplace(Record.readInt());
+ bool HasQualifier = CurrentUnpackingBits->getNextBit();
+ bool HasFoundDecl = CurrentUnpackingBits->getNextBit();
+ bool HasTemplateInfo = CurrentUnpackingBits->getNextBit();
unsigned NumTemplateArgs = Record.readInt();
E->Base = Record.readSubExpr();
E->MemberDecl = Record.readDeclAs<ValueDecl>();
E->MemberDNLoc = Record.readDeclarationNameLoc(E->MemberDecl->getDeclName());
E->MemberLoc = Record.readSourceLocation();
- E->MemberExprBits.IsArrow = Record.readInt();
+ E->MemberExprBits.IsArrow = CurrentUnpackingBits->getNextBit();
E->MemberExprBits.HasQualifierOrFoundDecl = HasQualifier || HasFoundDecl;
E->MemberExprBits.HasTemplateKWAndArgsInfo = HasTemplateInfo;
- E->MemberExprBits.HadMultipleCandidates = Record.readInt();
- E->MemberExprBits.NonOdrUseReason = Record.readInt();
+ E->MemberExprBits.HadMultipleCandidates = CurrentUnpackingBits->getNextBit();
+ E->MemberExprBits.NonOdrUseReason =
+ CurrentUnpackingBits->getNextBits(/*Width=*/2);
E->MemberExprBits.OperatorLoc = Record.readSourceLocation();
if (HasQualifier || HasFoundDecl) {
DeclAccessPair FoundDecl;
if (HasFoundDecl) {
auto *FoundD = Record.readDeclAs<NamedDecl>();
- auto AS = (AccessSpecifier)Record.readInt();
+ auto AS = (AccessSpecifier)CurrentUnpackingBits->getNextBits(/*Width=*/2);
FoundDecl = DeclAccessPair::make(FoundD, AS);
} else {
FoundDecl = DeclAccessPair::make(E->MemberDecl,
@@ -1091,10 +1105,14 @@ void ASTStmtReader::VisitCastExpr(CastExpr *E) {
VisitExpr(E);
unsigned NumBaseSpecs = Record.readInt();
assert(NumBaseSpecs == E->path_size());
- unsigned HasFPFeatures = Record.readInt();
+
+ CurrentUnpackingBits.emplace(Record.readInt());
+ E->setCastKind((CastKind)CurrentUnpackingBits->getNextBits(/*Width=*/7));
+ unsigned HasFPFeatures = CurrentUnpackingBits->getNextBit();
assert(E->hasStoredFPFeatures() == HasFPFeatures);
+
E->setSubExpr(Record.readSubExpr());
- E->setCastKind((CastKind)Record.readInt());
+
CastExpr::path_iterator BaseI = E->path_begin();
while (NumBaseSpecs--) {
auto *BaseSpec = new (Record.getContext()) CXXBaseSpecifier;
@@ -1107,10 +1125,12 @@ void ASTStmtReader::VisitCastExpr(CastExpr *E) {
}
void ASTStmtReader::VisitBinaryOperator(BinaryOperator *E) {
- bool hasFP_Features;
VisitExpr(E);
- E->setHasStoredFPFeatures(hasFP_Features = Record.readInt());
- E->setOpcode((BinaryOperator::Opcode)Record.readInt());
+ CurrentUnpackingBits.emplace(Record.readInt());
+ E->setOpcode(
+ (BinaryOperator::Opcode)CurrentUnpackingBits->getNextBits(/*Width=*/6));
+ bool hasFP_Features = CurrentUnpackingBits->getNextBit();
+ E->setHasStoredFPFeatures(hasFP_Features);
E->setLHS(Record.readSubExpr());
E->setRHS(Record.readSubExpr());
E->setOperatorLoc(readSourceLocation());
@@ -1148,7 +1168,7 @@ ASTStmtReader::VisitBinaryConditionalOperator(BinaryConditionalOperator *E) {
void ASTStmtReader::VisitImplicitCastExpr(ImplicitCastExpr *E) {
VisitCastExpr(E);
- E->setIsPartOfExplicitCast(Record.readInt());
+ E->setIsPartOfExplicitCast(CurrentUnpackingBits->getNextBit());
}
void ASTStmtReader::VisitExplicitCastExpr(ExplicitCastExpr *E) {
@@ -1764,8 +1784,8 @@ void ASTStmtReader::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) {
SourceRange R = readSourceRange();
E->Loc = R.getBegin();
E->RParenLoc = R.getEnd();
- R = readSourceRange();
- E->AngleBrackets = R;
+ if (CurrentUnpackingBits->getNextBit())
+ E->AngleBrackets = readSourceRange();
}
void ASTStmtReader::VisitCXXStaticCastExpr(CXXStaticCastExpr *E) {
@@ -1961,9 +1981,10 @@ void ASTStmtReader::VisitCXXDependentScopeMemberExpr(
CXXDependentScopeMemberExpr *E) {
VisitExpr(E);
- bool HasTemplateKWAndArgsInfo = Record.readInt();
unsigned NumTemplateArgs = Record.readInt();
- bool HasFirstQualifierFoundInScope = Record.readInt();
+ CurrentUnpackingBits.emplace(Record.readInt());
+ bool HasTemplateKWAndArgsInfo = CurrentUnpackingBits->getNextBit();
+ bool HasFirstQualifierFoundInScope = CurrentUnpackingBits->getNextBit();
assert((HasTemplateKWAndArgsInfo == E->hasTemplateKWAndArgsInfo()) &&
"Wrong HasTemplateKWAndArgsInfo!");
@@ -1979,11 +2000,18 @@ void ASTStmtReader::VisitCXXDependentScopeMemberExpr(
assert((NumTemplateArgs == E->getNumTemplateArgs()) &&
"Wrong NumTemplateArgs!");
- E->CXXDependentScopeMemberExprBits.IsArrow = Record.readInt();
- E->CXXDependentScopeMemberExprBits.OperatorLoc = readSourceLocation();
+ E->CXXDependentScopeMemberExprBits.IsArrow =
+ CurrentUnpackingBits->getNextBit();
+
E->BaseType = Record.readType();
E->QualifierLoc = Record.readNestedNameSpecifierLoc();
- E->Base = Record.readSubExpr();
+ // not ImplicitAccess
+ if (CurrentUnpackingBits->getNextBit())
+ E->Base = Record.readSubExpr();
+ else
+ E->Base = nullptr;
+
+ E->CXXDependentScopeMemberExprBits.OperatorLoc = readSourceLocation();
if (HasFirstQualifierFoundInScope)
*E->getTrailingObjects<NamedDecl *>() = readDeclAs<NamedDecl>();
@@ -1995,11 +2023,11 @@ void
ASTStmtReader::VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) {
VisitExpr(E);
- if (Record.readInt()) // HasTemplateKWAndArgsInfo
+ if (CurrentUnpackingBits->getNextBit()) // HasTemplateKWAndArgsInfo
ReadTemplateKWAndArgsInfo(
*E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(),
E->getTrailingObjects<TemplateArgumentLoc>(),
- /*NumTemplateArgs=*/Record.readInt());
+ /*NumTemplateArgs=*/CurrentUnpackingBits->getNextBits(/*Width=*/16));
E->QualifierLoc = Record.readNestedNameSpecifierLoc();
E->NameInfo = Record.readDeclarationNameInfo();
@@ -2022,15 +2050,15 @@ ASTStmtReader::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) {
void ASTStmtReader::VisitOverloadExpr(OverloadExpr *E) {
VisitExpr(E);
- BitsUnpacker OverloadExprBits = Record.readInt();
- unsigned NumResults = OverloadExprBits.getNextBits(/*Width=*/14);
- bool HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit();
+ unsigned NumResults = Record.readInt();
+ CurrentUnpackingBits.emplace(Record.readInt());
+ bool HasTemplateKWAndArgsInfo = CurrentUnpackingBits->getNextBit();
assert((E->getNumDecls() == NumResults) && "Wrong NumResults!");
assert((E->hasTemplateKWAndArgsInfo() == HasTemplateKWAndArgsInfo) &&
"Wrong HasTemplateKWAndArgsInfo!");
if (HasTemplateKWAndArgsInfo) {
- unsigned NumTemplateArgs = OverloadExprBits.getNextBits(/*Width=*/14);
+ unsigned NumTemplateArgs = Record.readInt();
ReadTemplateKWAndArgsInfo(*E->getTrailingASTTemplateKWAndArgsInfo(),
E->getTrailingTemplateArgumentLoc(),
NumTemplateArgs);
@@ -2057,17 +2085,24 @@ void ASTStmtReader::VisitOverloadExpr(OverloadExpr *E) {
void ASTStmtReader::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *E) {
VisitOverloadExpr(E);
- E->UnresolvedMemberExprBits.IsArrow = Record.readInt();
- E->UnresolvedMemberExprBits.HasUnresolvedUsing = Record.readInt();
- E->Base = Record.readSubExpr();
- E->BaseType = Record.readType();
+ E->UnresolvedMemberExprBits.IsArrow = CurrentUnpackingBits->getNextBit();
+ E->UnresolvedMemberExprBits.HasUnresolvedUsing =
+ CurrentUnpackingBits->getNextBit();
+
+ if (/*!isImplicitAccess=*/CurrentUnpackingBits->getNextBit())
+ E->Base = Record.readSubExpr();
+ else
+ E->Base = nullptr;
+
E->OperatorLoc = readSourceLocation();
+
+ E->BaseType = Record.readType();
}
void ASTStmtReader::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E) {
VisitOverloadExpr(E);
- E->UnresolvedLookupExprBits.RequiresADL = Record.readInt();
- E->UnresolvedLookupExprBits.Overloaded = Record.readInt();
+ E->UnresolvedLookupExprBits.RequiresADL = CurrentUnpackingBits->getNextBit();
+ E->UnresolvedLookupExprBits.Overloaded = CurrentUnpackingBits->getNextBit();
E->NamingClass = readDeclAs<CXXRecordDecl>();
}
@@ -2142,9 +2177,12 @@ void ASTStmtReader::VisitSubstNonTypeTemplateParmExpr(
SubstNonTypeTemplateParmExpr *E) {
VisitExpr(E);
E->AssociatedDeclAndRef.setPointer(readDeclAs<Decl>());
- E->AssociatedDeclAndRef.setInt(Record.readInt());
- E->Index = Record.readInt();
- E->PackIndex = Record.readInt();
+ E->AssociatedDeclAndRef.setInt(CurrentUnpackingBits->getNextBit());
+ E->Index = CurrentUnpackingBits->getNextBits(/*Width=*/12);
+ if (CurrentUnpackingBits->getNextBit())
+ E->PackIndex = Record.readInt();
+ else
+ E->PackIndex = 0;
E->SubstNonTypeTemplateParmExprBits.NameLoc = readSourceLocation();
E->Replacement = Record.readSubExpr();
}
@@ -2836,11 +2874,12 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
S = new (Context) NullStmt(Empty);
break;
- case STMT_COMPOUND:
- S = CompoundStmt::CreateEmpty(
- Context, /*NumStmts=*/Record[ASTStmtReader::NumStmtFields],
- /*HasFPFeatures=*/Record[ASTStmtReader::NumStmtFields + 1]);
+ case STMT_COMPOUND: {
+ unsigned NumStmts = Record[ASTStmtReader::NumStmtFields];
+ bool HasFPFeatures = Record[ASTStmtReader::NumStmtFields + 1];
+ S = CompoundStmt::CreateEmpty(Context, NumStmts, HasFPFeatures);
break;
+ }
case STMT_CASE:
S = CaseStmt::CreateEmpty(
@@ -2862,13 +2901,14 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
/*NumAttrs*/Record[ASTStmtReader::NumStmtFields]);
break;
- case STMT_IF:
- S = IfStmt::CreateEmpty(
- Context,
- /* HasElse=*/Record[ASTStmtReader::NumStmtFields],
- /* HasVar=*/Record[ASTStmtReader::NumStmtFields + 1],
- /* HasInit=*/Record[ASTStmtReader::NumStmtFields + 2]);
+ case STMT_IF: {
+ BitsUnpacker IfStmtBits(Record[ASTStmtReader::NumStmtFields]);
+ bool HasElse = IfStmtBits.getNextBit();
+ bool HasVar = IfStmtBits.getNextBit();
+ bool HasInit = IfStmtBits.getNextBit();
+ S = IfStmt::CreateEmpty(Context, HasElse, HasVar, HasInit);
break;
+ }
case STMT_SWITCH:
S = SwitchStmt::CreateEmpty(
@@ -2945,17 +2985,19 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
/*HasFunctionName*/ Record[ASTStmtReader::NumExprFields]);
break;
- case EXPR_DECL_REF:
- S = DeclRefExpr::CreateEmpty(
- Context,
- /*HasQualifier=*/Record[ASTStmtReader::NumExprFields],
- /*HasFoundDecl=*/Record[ASTStmtReader::NumExprFields + 1],
- /*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields + 2],
- /*NumTemplateArgs=*/
- Record[ASTStmtReader::NumExprFields + 2]
- ? Record[ASTStmtReader::NumExprFields + 7]
- : 0);
+ case EXPR_DECL_REF: {
+ BitsUnpacker DeclRefExprBits(Record[ASTStmtReader::NumExprFields]);
+ DeclRefExprBits.advance(5);
+ bool HasFoundDecl = DeclRefExprBits.getNextBit();
+ bool HasQualifier = DeclRefExprBits.getNextBit();
+ bool HasTemplateKWAndArgsInfo = DeclRefExprBits.getNextBit();
+ unsigned NumTemplateArgs = HasTemplateKWAndArgsInfo
+ ? Record[ASTStmtReader::NumExprFields + 1]
+ : 0;
+ S = DeclRefExpr::CreateEmpty(Context, HasQualifier, HasFoundDecl,
+ HasTemplateKWAndArgsInfo, NumTemplateArgs);
break;
+ }
case EXPR_INTEGER_LITERAL:
S = IntegerLiteral::Create(Context, Empty);
@@ -2995,10 +3037,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
/* NumExprs=*/Record[ASTStmtReader::NumExprFields]);
break;
- case EXPR_UNARY_OPERATOR:
- S = UnaryOperator::CreateEmpty(Context,
- Record[ASTStmtReader::NumExprFields]);
+ case EXPR_UNARY_OPERATOR: {
+ BitsUnpacker UnaryOperatorBits(Record[ASTStmtReader::NumStmtFields]);
+ UnaryOperatorBits.advance(ASTStmtReader::NumExprBits);
+ bool HasFPFeatures = UnaryOperatorBits.getNextBit();
+ S = UnaryOperator::CreateEmpty(Context, HasFPFeatures);
break;
+ }
case EXPR_OFFSETOF:
S = OffsetOfExpr::CreateEmpty(Context,
@@ -3033,8 +3078,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
break;
case EXPR_CALL: {
- BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]);
- auto NumArgs = CallExprBits.getNextBits(/*Width=*/16);
+ auto NumArgs = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CallExprBits.advance(1);
auto HasFPFeatures = CallExprBits.getNextBit();
S = CallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty);
break;
@@ -3045,22 +3091,32 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
Context, /*NumArgs=*/Record[ASTStmtReader::NumExprFields]);
break;
- case EXPR_MEMBER:
- S = MemberExpr::CreateEmpty(Context, Record[ASTStmtReader::NumExprFields],
- Record[ASTStmtReader::NumExprFields + 1],
- Record[ASTStmtReader::NumExprFields + 2],
- Record[ASTStmtReader::NumExprFields + 3]);
+ case EXPR_MEMBER: {
+ BitsUnpacker ExprMemberBits(Record[ASTStmtReader::NumExprFields]);
+ bool HasQualifier = ExprMemberBits.getNextBit();
+ bool HasFoundDecl = ExprMemberBits.getNextBit();
+ bool HasTemplateInfo = ExprMemberBits.getNextBit();
+ unsigned NumTemplateArgs = Record[ASTStmtReader::NumExprFields + 1];
+ S = MemberExpr::CreateEmpty(Context, HasQualifier, HasFoundDecl,
+ HasTemplateInfo, NumTemplateArgs);
break;
+ }
- case EXPR_BINARY_OPERATOR:
- S = BinaryOperator::CreateEmpty(Context,
- Record[ASTStmtReader::NumExprFields]);
+ case EXPR_BINARY_OPERATOR: {
+ BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumExprFields]);
+ BinaryOperatorBits.advance(/*Size of opcode*/ 6);
+ bool HasFPFeatures = BinaryOperatorBits.getNextBit();
+ S = BinaryOperator::CreateEmpty(Context, HasFPFeatures);
break;
+ }
- case EXPR_COMPOUND_ASSIGN_OPERATOR:
- S = CompoundAssignOperator::CreateEmpty(
- Context, Record[ASTStmtReader::NumExprFields]);
+ case EXPR_COMPOUND_ASSIGN_OPERATOR: {
+ BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumExprFields]);
+ BinaryOperatorBits.advance(/*Size of opcode*/ 6);
+ bool HasFPFeatures = BinaryOperatorBits.getNextBit();
+ S = CompoundAssignOperator::CreateEmpty(Context, HasFPFeatures);
break;
+ }
case EXPR_CONDITIONAL_OPERATOR:
S = new (Context) ConditionalOperator(Empty);
@@ -3070,19 +3126,23 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
S = new (Context) BinaryConditionalOperator(Empty);
break;
- case EXPR_IMPLICIT_CAST:
- S = ImplicitCastExpr::CreateEmpty(
- Context,
- /*PathSize*/ Record[ASTStmtReader::NumExprFields],
- /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
+ case EXPR_IMPLICIT_CAST: {
+ unsigned PathSize = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CastExprBits.advance(7);
+ bool HasFPFeatures = CastExprBits.getNextBit();
+ S = ImplicitCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures);
break;
+ }
- case EXPR_CSTYLE_CAST:
- S = CStyleCastExpr::CreateEmpty(
- Context,
- /*PathSize*/ Record[ASTStmtReader::NumExprFields],
- /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
+ case EXPR_CSTYLE_CAST: {
+ unsigned PathSize = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CastExprBits.advance(7);
+ bool HasFPFeatures = CastExprBits.getNextBit();
+ S = CStyleCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures);
break;
+ }
case EXPR_COMPOUND_LITERAL:
S = new (Context) CompoundLiteralExpr(Empty);
@@ -3777,8 +3837,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
}
case EXPR_CXX_OPERATOR_CALL: {
- BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]);
- auto NumArgs = CallExprBits.getNextBits(/*Width=*/16);
+ auto NumArgs = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CallExprBits.advance(1);
auto HasFPFeatures = CallExprBits.getNextBit();
S = CXXOperatorCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures,
Empty);
@@ -3786,8 +3847,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
}
case EXPR_CXX_MEMBER_CALL: {
- BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]);
- auto NumArgs = CallExprBits.getNextBits(/*Width=*/16);
+ auto NumArgs = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CallExprBits.advance(1);
auto HasFPFeatures = CallExprBits.getNextBit();
S = CXXMemberCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures,
Empty);
@@ -3814,22 +3876,26 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
/* NumArgs=*/Record[ASTStmtReader::NumExprFields]);
break;
- case EXPR_CXX_STATIC_CAST:
- S = CXXStaticCastExpr::CreateEmpty(
- Context,
- /*PathSize*/ Record[ASTStmtReader::NumExprFields],
- /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
+ case EXPR_CXX_STATIC_CAST: {
+ unsigned PathSize = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CastExprBits.advance(7);
+ bool HasFPFeatures = CastExprBits.getNextBit();
+ S = CXXStaticCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures);
break;
+ }
- case EXPR_CXX_DYNAMIC_CAST:
- S = CXXDynamicCastExpr::CreateEmpty(Context,
- /*PathSize*/ Record[ASTStmtReader::NumExprFields]);
+ case EXPR_CXX_DYNAMIC_CAST: {
+ unsigned PathSize = Record[ASTStmtReader::NumExprFields];
+ S = CXXDynamicCastExpr::CreateEmpty(Context, PathSize);
break;
+ }
- case EXPR_CXX_REINTERPRET_CAST:
- S = CXXReinterpretCastExpr::CreateEmpty(Context,
- /*PathSize*/ Record[ASTStmtReader::NumExprFields]);
+ case EXPR_CXX_REINTERPRET_CAST: {
+ unsigned PathSize = Record[ASTStmtReader::NumExprFields];
+ S = CXXReinterpretCastExpr::CreateEmpty(Context, PathSize);
break;
+ }
case EXPR_CXX_CONST_CAST:
S = CXXConstCastExpr::CreateEmpty(Context);
@@ -3839,21 +3905,28 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
S = CXXAddrspaceCastExpr::CreateEmpty(Context);
break;
- case EXPR_CXX_FUNCTIONAL_CAST:
- S = CXXFunctionalCastExpr::CreateEmpty(
- Context,
- /*PathSize*/ Record[ASTStmtReader::NumExprFields],
- /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]);
+ case EXPR_CXX_FUNCTIONAL_CAST: {
+ unsigned PathSize = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CastExprBits.advance(7);
+ bool HasFPFeatures = CastExprBits.getNextBit();
+ S = CXXFunctionalCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures);
break;
+ }
- case EXPR_BUILTIN_BIT_CAST:
- assert(Record[ASTStmtReader::NumExprFields] == 0 && "Wrong PathSize!");
+ case EXPR_BUILTIN_BIT_CAST: {
+#ifndef NDEBUG
+ unsigned PathSize = Record[ASTStmtReader::NumExprFields];
+ assert(PathSize == 0 && "Wrong PathSize!");
+#endif
S = new (Context) BuiltinBitCastExpr(Empty);
break;
+ }
case EXPR_USER_DEFINED_LITERAL: {
- BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]);
- auto NumArgs = CallExprBits.getNextBits(/*Width=*/16);
+ auto NumArgs = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CallExprBits.advance(1);
auto HasFPFeatures = CallExprBits.getNextBit();
S = UserDefinedLiteral::CreateEmpty(Context, NumArgs, HasFPFeatures,
Empty);
@@ -3944,47 +4017,62 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
Record[ASTStmtReader::NumExprFields]);
break;
- case EXPR_CXX_DEPENDENT_SCOPE_MEMBER:
+ case EXPR_CXX_DEPENDENT_SCOPE_MEMBER: {
+ unsigned NumTemplateArgs = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker DependentScopeMemberBits(
+ Record[ASTStmtReader::NumExprFields + 1]);
+ bool HasTemplateKWAndArgsInfo = DependentScopeMemberBits.getNextBit();
+
+ bool HasFirstQualifierFoundInScope =
+ DependentScopeMemberBits.getNextBit();
S = CXXDependentScopeMemberExpr::CreateEmpty(
- Context,
- /*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields],
- /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields + 1],
- /*HasFirstQualifierFoundInScope=*/
- Record[ASTStmtReader::NumExprFields + 2]);
+ Context, HasTemplateKWAndArgsInfo, NumTemplateArgs,
+ HasFirstQualifierFoundInScope);
break;
+ }
- case EXPR_CXX_DEPENDENT_SCOPE_DECL_REF:
- S = DependentScopeDeclRefExpr::CreateEmpty(Context,
- /*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields],
- /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields]
- ? Record[ASTStmtReader::NumExprFields + 1]
- : 0);
+ case EXPR_CXX_DEPENDENT_SCOPE_DECL_REF: {
+ BitsUnpacker DependentScopeDeclRefBits(
+ Record[ASTStmtReader::NumStmtFields]);
+ DependentScopeDeclRefBits.advance(ASTStmtReader::NumExprBits);
+ bool HasTemplateKWAndArgsInfo = DependentScopeDeclRefBits.getNextBit();
+ unsigned NumTemplateArgs =
+ HasTemplateKWAndArgsInfo
+ ? DependentScopeDeclRefBits.getNextBits(/*Width=*/16)
+ : 0;
+ S = DependentScopeDeclRefExpr::CreateEmpty(
+ Context, HasTemplateKWAndArgsInfo, NumTemplateArgs);
break;
+ }
case EXPR_CXX_UNRESOLVED_CONSTRUCT:
S = CXXUnresolvedConstructExpr::CreateEmpty(Context,
/*NumArgs=*/Record[ASTStmtReader::NumExprFields]);
break;
- case EXPR_CXX_UNRESOLVED_MEMBER:
+ case EXPR_CXX_UNRESOLVED_MEMBER: {
+ auto NumResults = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ auto HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit();
+ auto NumTemplateArgs = HasTemplateKWAndArgsInfo
+ ? Record[ASTStmtReader::NumExprFields + 2]
+ : 0;
S = UnresolvedMemberExpr::CreateEmpty(
- Context,
- /*NumResults=*/Record[ASTStmtReader::NumExprFields] & ((1 << 14) - 1),
- /*HasTemplateKWAndArgsInfo=*/
- (Record[ASTStmtReader::NumExprFields] >> 14) & (0x1),
- /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields] >> 14 &
- ((1 << 14) - 1));
+ Context, NumResults, HasTemplateKWAndArgsInfo, NumTemplateArgs);
break;
+ }
- case EXPR_CXX_UNRESOLVED_LOOKUP:
+ case EXPR_CXX_UNRESOLVED_LOOKUP: {
+ auto NumResults = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ auto HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit();
+ auto NumTemplateArgs = HasTemplateKWAndArgsInfo
+ ? Record[ASTStmtReader::NumExprFields + 2]
+ : 0;
S = UnresolvedLookupExpr::CreateEmpty(
- Context,
- /*NumResults=*/Record[ASTStmtReader::NumExprFields] & ((1 << 14) - 1),
- /*HasTemplateKWAndArgsInfo=*/
- (Record[ASTStmtReader::NumExprFields] >> 14) & (0x1),
- /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields] >> 14 &
- ((1 << 14) - 1));
+ Context, NumResults, HasTemplateKWAndArgsInfo, NumTemplateArgs);
break;
+ }
case EXPR_TYPE_TRAIT:
S = TypeTraitExpr::CreateDeserialized(Context,
@@ -4044,8 +4132,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
break;
case EXPR_CUDA_KERNEL_CALL: {
- BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]);
- auto NumArgs = CallExprBits.getNextBits(/*Width=*/16);
+ auto NumArgs = Record[ASTStmtReader::NumExprFields];
+ BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]);
+ CallExprBits.advance(1);
auto HasFPFeatures = CallExprBits.getNextBit();
S = CUDAKernelCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures,
Empty);
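
For orientation, the reader-side hunks above now take NumArgs straight from the record and unpack only the per-CallExpr bits, skipping the ADLCallKind bit with advance(1) before reading HasFPFeatures. A minimal sketch of that access pattern, using a toy stand-in rather than Clang's real BitsUnpacker:

#include <cassert>
#include <cstdint>

// Simplified stand-in for the bit-unpacking helper used by the reader above;
// it mirrors only the interface exercised in the hunks
// (advance/getNextBit/getNextBits), not the real class.
class ToyBitsUnpacker {
public:
  explicit ToyBitsUnpacker(uint32_t Value) : Value(Value) {}

  // Skip bits that an earlier visitor already consumed (e.g. ADLCallKind).
  void advance(unsigned N) { CurrentBit += N; }

  bool getNextBit() { return (Value >> CurrentBit++) & 1; }

  uint32_t getNextBits(unsigned Width) {
    uint32_t Result = (Value >> CurrentBit) & ((1u << Width) - 1);
    CurrentBit += Width;
    return Result;
  }

private:
  uint32_t Value;
  unsigned CurrentBit = 0;
};

int main() {
  // Assumed layout for the per-CallExpr packed word: bit 0 = ADLCallKind,
  // bit 1 = HasFPFeatures; NumArgs is a separate record element.
  uint32_t Packed = 0b10; // ADLCallKind = 0, HasFPFeatures = 1.
  ToyBitsUnpacker Bits(Packed);
  Bits.advance(1);                   // Skip ADLCallKind, as the reader does.
  assert(Bits.getNextBit() == true); // HasFPFeatures.
  return 0;
}
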
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 91eb2af8f8ad..78939bfd533f 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6003,12 +6003,17 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
BitsPacker DefinitionBits;
-#define FIELD(Name, Width, Merge) DefinitionBits.addBits(Data.Name, Width);
+#define FIELD(Name, Width, Merge) \
+ if (!DefinitionBits.canWriteNextNBits(Width)) { \
+ Record->push_back(DefinitionBits); \
+ DefinitionBits.reset(0); \
+ } \
+ DefinitionBits.addBits(Data.Name, Width);
+
#include "clang/AST/CXXRecordDeclDefinitionBits.def"
#undef FIELD
- while (DefinitionBits.hasUnconsumedValues())
- Record->push_back(DefinitionBits.getNextValue());
+ Record->push_back(DefinitionBits);
// getODRHash will compute the ODRHash if it has not been previously computed.
Record->push_back(D->getODRHash());
@@ -6047,7 +6052,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
LambdaBits.addBits(Lambda.CaptureDefault, /*Width=*/2);
LambdaBits.addBits(Lambda.NumCaptures, /*Width=*/15);
LambdaBits.addBit(Lambda.HasKnownInternalLinkage);
- Record->push_back(LambdaBits.getNextValue());
+ Record->push_back(LambdaBits);
Record->push_back(Lambda.NumExplicitCaptures);
Record->push_back(Lambda.ManglingNumber);
@@ -6058,10 +6063,12 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) {
for (unsigned I = 0, N = Lambda.NumCaptures; I != N; ++I) {
const LambdaCapture &Capture = Lambda.Captures.front()[I];
AddSourceLocation(Capture.getLocation());
+
BitsPacker CaptureBits;
CaptureBits.addBit(Capture.isImplicit());
CaptureBits.addBits(Capture.getCaptureKind(), /*Width=*/3);
Record->push_back(CaptureBits);
+
switch (Capture.getCaptureKind()) {
case LCK_StarThis:
case LCK_This:
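
The rewritten FIELD macro above flushes the packer into the record whenever the next field would no longer fit, and relies on the packer converting implicitly to an integer when pushed. A toy sketch of that flush pattern (ToyBitsPacker and addField are illustrative stand-ins, not the real serialization helpers):

#include <cstdint>
#include <vector>

// Toy stand-in for BitsPacker, only to illustrate the flush pattern used by
// the new FIELD macro.
class ToyBitsPacker {
public:
  bool canWriteNextNBits(unsigned Width) const {
    return CurrentBit + Width <= 32;
  }
  void addBits(uint32_t Value, unsigned Width) {
    Packed |= (uint64_t)Value << CurrentBit;
    CurrentBit += Width;
  }
  void reset(uint32_t Init) {
    Packed = Init;
    CurrentBit = 0;
  }
  operator uint32_t() const { return (uint32_t)Packed; }

private:
  uint64_t Packed = 0;
  unsigned CurrentBit = 0;
};

// Mirrors the rewritten macro: flush to the record when the next field would
// not fit, then keep packing into a fresh value.
void addField(std::vector<uint32_t> &Record, ToyBitsPacker &Bits,
              uint32_t Value, unsigned Width) {
  if (!Bits.canWriteNextNBits(Width)) {
    Record.push_back(Bits);
    Bits.reset(0);
  }
  Bits.addBits(Value, Width);
}

int main() {
  std::vector<uint32_t> Record;
  ToyBitsPacker Bits;
  for (int I = 0; I < 20; ++I)
    addField(Record, Bits, /*Value=*/1, /*Width=*/3); // 20 * 3 = 60 bits.
  Record.push_back(Bits); // Final flush, as AddCXXDefinitionData does.
  return Record.size() == 2 ? 0 : 1; // 60 bits span two 32-bit words.
}
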
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 43169b2befc6..9e3299f04918 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -321,15 +321,25 @@ void ASTDeclWriter::Visit(Decl *D) {
void ASTDeclWriter::VisitDecl(Decl *D) {
BitsPacker DeclBits;
+
+ // The order matters here: it is better to put the bits that are most likely
+ // to be 0 at the end (the high bits) of the packed value.
+ //
+ // Since we're using the VBR6 format to store the packed value, it is most
+ // efficient when all the higher bits are 0.
+ // For example, if we need to pack 8 bits into a value and the stored value
+ // is 0xf0, the actual stored value will be 0b000111'110000, which takes 12
+ // bits. However, if we change the order so the value becomes 0x0f, we can
+ // store it as 0b001111, which takes only 6 bits.
+ DeclBits.addBits((uint64_t)D->getModuleOwnershipKind(), /*BitWidth=*/3);
+ DeclBits.addBit(D->isReferenced());
+ DeclBits.addBit(D->isUsed(false));
+ DeclBits.addBits(D->getAccess(), /*BitWidth=*/2);
+ DeclBits.addBit(D->isImplicit());
DeclBits.addBit(D->getDeclContext() != D->getLexicalDeclContext());
- DeclBits.addBit(D->isInvalidDecl());
DeclBits.addBit(D->hasAttrs());
- DeclBits.addBit(D->isImplicit());
- DeclBits.addBit(D->isUsed(false));
- DeclBits.addBit(D->isReferenced());
DeclBits.addBit(D->isTopLevelDeclInObjCContainer());
- DeclBits.addBits(D->getAccess(), /*BitWidth=*/2);
- DeclBits.addBits((uint64_t)D->getModuleOwnershipKind(), /*BitWidth=*/3);
+ DeclBits.addBit(D->isInvalidDecl());
Record.push_back(DeclBits);
Record.AddDeclRef(cast_or_null<Decl>(D->getDeclContext()));
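
The 0xf0 versus 0x0f example in the comment above follows from how VBR6 chunks a value: each 6-bit chunk carries 5 payload bits plus a continuation bit. A small, self-contained check of that arithmetic (vbr6Bits is illustrative only, not the bitstream writer's implementation):

#include <cstdint>
#include <cstdio>

// Number of bits a value occupies when emitted as VBR6: each 6-bit chunk
// carries 5 payload bits plus a continuation bit.
unsigned vbr6Bits(uint64_t Value) {
  unsigned Chunks = 1;
  while (Value >>= 5)
    ++Chunks;
  return Chunks * 6;
}

int main() {
  // Same 8 packed bits, two different orders: high bits set vs. low bits set.
  std::printf("0xf0 -> %u bits\n", vbr6Bits(0xf0)); // 12 bits.
  std::printf("0x0f -> %u bits\n", vbr6Bits(0x0f)); // 6 bits.
  return 0;
}
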
@@ -493,21 +503,13 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) {
Record.AddDeclRef(nullptr);
}
- if (D->getDeclContext() == D->getLexicalDeclContext() &&
- !D->hasAttrs() &&
- !D->isImplicit() &&
- !D->isUsed(false) &&
- !D->hasExtInfo() &&
+ if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() &&
+ !D->isInvalidDecl() && !D->isImplicit() && !D->hasExtInfo() &&
!D->getTypedefNameForAnonDecl() &&
D->getFirstDecl() == D->getMostRecentDecl() &&
- !D->isInvalidDecl() &&
- !D->isReferenced() &&
!D->isTopLevelDeclInObjCContainer() &&
- D->getAccess() == AS_none &&
- !D->isModulePrivate() &&
!CXXRecordDecl::classofKind(D->getKind()) &&
- !D->getIntegerTypeSourceInfo() &&
- !D->getMemberSpecializationInfo() &&
+ !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() &&
!needsAnonymousDeclarationNumber(D) &&
D->getDeclName().getNameKind() == DeclarationName::Identifier)
AbbrevToUse = Writer.getDeclEnumAbbrev();
@@ -542,18 +544,11 @@ void ASTDeclWriter::VisitRecordDecl(RecordDecl *D) {
if (!isa<CXXRecordDecl>(D))
Record.push_back(D->getODRHash());
- if (D->getDeclContext() == D->getLexicalDeclContext() &&
- !D->hasAttrs() &&
- !D->isImplicit() &&
- !D->isUsed(false) &&
- !D->hasExtInfo() &&
+ if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() &&
+ !D->isImplicit() && !D->isInvalidDecl() && !D->hasExtInfo() &&
!D->getTypedefNameForAnonDecl() &&
D->getFirstDecl() == D->getMostRecentDecl() &&
- !D->isInvalidDecl() &&
- !D->isReferenced() &&
!D->isTopLevelDeclInObjCContainer() &&
- D->getAccess() == AS_none &&
- !D->isModulePrivate() &&
!CXXRecordDecl::classofKind(D->getKind()) &&
!needsAnonymousDeclarationNumber(D) &&
D->getDeclName().getNameKind() == DeclarationName::Identifier)
@@ -674,11 +669,16 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
Record.AddDeclarationNameLoc(D->DNLoc, D->getDeclName());
Record.push_back(D->getIdentifierNamespace());
+ // The order matters here: it is better to put the bits that are most likely
+ // to be 0 at the end. See the comments in VisitDecl for details.
BitsPacker FunctionDeclBits;
// FIXME: stable encoding
+ FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3);
FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3);
FunctionDeclBits.addBit(D->isInlineSpecified());
FunctionDeclBits.addBit(D->isInlined());
+ FunctionDeclBits.addBit(D->hasSkippedBody());
FunctionDeclBits.addBit(D->isVirtualAsWritten());
FunctionDeclBits.addBit(D->isPure());
FunctionDeclBits.addBit(D->hasInheritedPrototype());
@@ -689,14 +689,12 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) {
FunctionDeclBits.addBit(D->isDefaulted());
FunctionDeclBits.addBit(D->isExplicitlyDefaulted());
FunctionDeclBits.addBit(D->isIneligibleOrNotSelected());
- FunctionDeclBits.addBit(D->hasImplicitReturnZero());
FunctionDeclBits.addBits((uint64_t)(D->getConstexprKind()), /*BitWidth=*/2);
- FunctionDeclBits.addBit(D->usesSEHTry());
- FunctionDeclBits.addBit(D->hasSkippedBody());
+ FunctionDeclBits.addBit(D->hasImplicitReturnZero());
FunctionDeclBits.addBit(D->isMultiVersion());
FunctionDeclBits.addBit(D->isLateTemplateParsed());
FunctionDeclBits.addBit(D->FriendConstraintRefersToEnclosingTemplate());
- FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3);
+ FunctionDeclBits.addBit(D->usesSEHTry());
Record.push_back(FunctionDeclBits);
Record.AddSourceLocation(D->getEndLoc());
@@ -1060,7 +1058,28 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
VisitRedeclarable(D);
VisitDeclaratorDecl(D);
+ // The order matters here: it is better to put the bits that are most likely
+ // to be 0 at the end. See the comments in VisitDecl for details.
BitsPacker VarDeclBits;
+ VarDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()),
+ /*BitWidth=*/3);
+
+ bool ModulesCodegen = false;
+ if (Writer.WritingModule && D->getStorageDuration() == SD_Static &&
+ !D->getDescribedVarTemplate()) {
+ // When building a C++20 module interface unit or a partition unit, a
+ // strong definition in the module interface is provided by the
+ // compilation of that unit, not by its users. (Inline variables are still
+ // emitted in module users.)
+ ModulesCodegen =
+ (Writer.WritingModule->isInterfaceOrPartition() ||
+ (D->hasAttr<DLLExportAttr>() &&
+ Writer.Context->getLangOpts().BuildingPCHWithObjectFile)) &&
+ Writer.Context->GetGVALinkageForVariable(D) >= GVA_StrongExternal;
+ }
+ VarDeclBits.addBit(ModulesCodegen);
+
VarDeclBits.addBits(D->getStorageClass(), /*BitWidth=*/3);
VarDeclBits.addBits(D->getTSCSpec(), /*BitWidth=*/2);
VarDeclBits.addBits(D->getInitStyle(), /*BitWidth=*/2);
@@ -1072,41 +1091,26 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
VarDeclBits.addBit(D->isExceptionVariable());
VarDeclBits.addBit(D->isNRVOVariable());
VarDeclBits.addBit(D->isCXXForRangeDecl());
- VarDeclBits.addBit(D->isObjCForDecl());
+
VarDeclBits.addBit(D->isInline());
VarDeclBits.addBit(D->isInlineSpecified());
VarDeclBits.addBit(D->isConstexpr());
VarDeclBits.addBit(D->isInitCapture());
VarDeclBits.addBit(D->isPreviousDeclInSameBlockScope());
+ VarDeclBits.addBit(D->isEscapingByref());
+ HasDeducedType = D->getType()->getContainedDeducedType();
+ VarDeclBits.addBit(HasDeducedType);
+
if (const auto *IPD = dyn_cast<ImplicitParamDecl>(D))
VarDeclBits.addBits(llvm::to_underlying(IPD->getParameterKind()),
/*Width=*/3);
else
VarDeclBits.addBits(0, /*Width=*/3);
- VarDeclBits.addBit(D->isEscapingByref());
- HasDeducedType = D->getType()->getContainedDeducedType();
- VarDeclBits.addBit(HasDeducedType);
- }
-
- VarDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), /*BitWidth=*/3);
-
- bool ModulesCodegen = false;
- if (Writer.WritingModule && D->getStorageDuration() == SD_Static &&
- !D->getDescribedVarTemplate()) {
- // When building a C++20 module interface unit or a partition unit, a
- // strong definition in the module interface is provided by the
- // compilation of that unit, not by its users. (Inline variables are still
- // emitted in module users.)
- ModulesCodegen =
- (Writer.WritingModule->isInterfaceOrPartition() ||
- (D->hasAttr<DLLExportAttr>() &&
- Writer.Context->getLangOpts().BuildingPCHWithObjectFile)) &&
- Writer.Context->GetGVALinkageForVariable(D) >= GVA_StrongExternal;
+ VarDeclBits.addBit(D->isObjCForDecl());
}
- VarDeclBits.addBit(ModulesCodegen);
Record.push_back(VarDeclBits);
if (ModulesCodegen)
@@ -1135,29 +1139,17 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) {
Record.push_back(VarNotTemplate);
}
- if (D->getDeclContext() == D->getLexicalDeclContext() &&
- !D->hasAttrs() &&
- !D->isImplicit() &&
- !D->isUsed(false) &&
- !D->isInvalidDecl() &&
- !D->isReferenced() &&
+ if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() &&
!D->isTopLevelDeclInObjCContainer() &&
- D->getAccess() == AS_none &&
- !D->isModulePrivate() &&
!needsAnonymousDeclarationNumber(D) &&
D->getDeclName().getNameKind() == DeclarationName::Identifier &&
- !D->hasExtInfo() &&
- D->getFirstDecl() == D->getMostRecentDecl() &&
- D->getKind() == Decl::Var &&
- !D->isInline() &&
- !D->isConstexpr() &&
- !D->isInitCapture() &&
- !D->isPreviousDeclInSameBlockScope() &&
- !D->isEscapingByref() &&
- !HasDeducedType &&
- D->getStorageDuration() != SD_Static &&
- !D->getDescribedVarTemplate() &&
- !D->getMemberSpecializationInfo())
+ !D->hasExtInfo() && D->getFirstDecl() == D->getMostRecentDecl() &&
+ D->getKind() == Decl::Var && !D->isInline() && !D->isConstexpr() &&
+ !D->isInitCapture() && !D->isPreviousDeclInSameBlockScope() &&
+ !D->isEscapingByref() && !HasDeducedType &&
+ D->getStorageDuration() != SD_Static && !D->getDescribedVarTemplate() &&
+ !D->getMemberSpecializationInfo() && !D->isObjCForDecl() &&
+ !isa<ImplicitParamDecl>(D) && !D->isEscapingByref())
AbbrevToUse = Writer.getDeclVarAbbrev();
Code = serialization::DECL_VAR;
@@ -1171,10 +1163,14 @@ void ASTDeclWriter::VisitImplicitParamDecl(ImplicitParamDecl *D) {
void ASTDeclWriter::VisitParmVarDecl(ParmVarDecl *D) {
VisitVarDecl(D);
+ // See the implementation of `ParmVarDecl::getParameterIndex()`: the index
+ // may exceed the size of the normal bitfield, so it is better not to pack
+ // these bits.
+ Record.push_back(D->getFunctionScopeIndex());
+
BitsPacker ParmVarDeclBits;
ParmVarDeclBits.addBit(D->isObjCMethodParameter());
ParmVarDeclBits.addBits(D->getFunctionScopeDepth(), /*BitsWidth=*/7);
- ParmVarDeclBits.addBits(D->getFunctionScopeIndex(), /*BitsWidth=*/8);
// FIXME: stable encoding
ParmVarDeclBits.addBits(D->getObjCDeclQualifier(), /*BitsWidth=*/7);
ParmVarDeclBits.addBit(D->isKNRPromoted());
@@ -1193,14 +1189,10 @@ void ASTDeclWriter::VisitParmVarDecl(ParmVarDecl *D) {
// we dynamically check for the properties that we optimize for, but don't
// know are true of all PARM_VAR_DECLs.
if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() &&
- !D->hasExtInfo() && !D->isImplicit() && !D->isUsed(false) &&
- !D->isInvalidDecl() && !D->isReferenced() && D->getAccess() == AS_none &&
- !D->isModulePrivate() && D->getStorageClass() == 0 &&
+ !D->hasExtInfo() && D->getStorageClass() == 0 && !D->isInvalidDecl() &&
+ !D->isTopLevelDeclInObjCContainer() &&
D->getInitStyle() == VarDecl::CInit && // Can params have anything else?
- D->getFunctionScopeDepth() == 0 && D->getObjCDeclQualifier() == 0 &&
- !D->isKNRPromoted() && !D->isExplicitObjectParameter() &&
- !D->hasInheritedDefaultArg() && D->getInit() == nullptr &&
- !D->hasUninstantiatedDefaultArg()) // No default expr.
+ D->getInit() == nullptr) // No default expr.
AbbrevToUse = Writer.getDeclParmVarAbbrev();
// Check things we know are true of *every* PARM_VAR_DECL, which is more than
@@ -1403,6 +1395,13 @@ void ASTDeclWriter::VisitUsingShadowDecl(UsingShadowDecl *D) {
Record.push_back(D->getIdentifierNamespace());
Record.AddDeclRef(D->UsingOrNextShadow);
Record.AddDeclRef(Context.getInstantiatedFromUsingShadowDecl(D));
+
+ if (D->getDeclContext() == D->getLexicalDeclContext() &&
+ D->getFirstDecl() == D->getMostRecentDecl() && !D->hasAttrs() &&
+ !needsAnonymousDeclarationNumber(D) &&
+ D->getDeclName().getNameKind() == DeclarationName::Identifier)
+ AbbrevToUse = Writer.getDeclUsingShadowAbbrev();
+
Code = serialization::DECL_USING_SHADOW;
}
@@ -1507,10 +1506,32 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) {
D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() &&
!D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() &&
D->getDeclName().getNameKind() == DeclarationName::Identifier &&
- !D->hasExtInfo() && !D->hasInheritedPrototype() &&
- D->hasWrittenPrototype() &&
- D->getTemplatedKind() == FunctionDecl::TK_NonTemplate)
- AbbrevToUse = Writer.getDeclCXXMethodAbbrev();
+ !D->hasExtInfo() && !D->isExplicitlyDefaulted()) {
+ if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate ||
+ D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate ||
+ D->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization ||
+ D->getTemplatedKind() == FunctionDecl::TK_DependentNonTemplate)
+ AbbrevToUse = Writer.getDeclCXXMethodAbbrev(D->getTemplatedKind());
+ else if (D->getTemplatedKind() ==
+ FunctionDecl::TK_FunctionTemplateSpecialization) {
+ FunctionTemplateSpecializationInfo *FTSInfo =
+ D->getTemplateSpecializationInfo();
+
+ if (FTSInfo->TemplateArguments->size() == 1) {
+ const TemplateArgument &TA = FTSInfo->TemplateArguments->get(0);
+ if (TA.getKind() == TemplateArgument::Type &&
+ !FTSInfo->TemplateArgumentsAsWritten &&
+ !FTSInfo->getMemberSpecializationInfo())
+ AbbrevToUse = Writer.getDeclCXXMethodAbbrev(D->getTemplatedKind());
+ }
+ } else if (D->getTemplatedKind() ==
+ FunctionDecl::TK_DependentFunctionTemplateSpecialization) {
+ DependentFunctionTemplateSpecializationInfo *DFTSInfo =
+ D->getDependentSpecializationInfo();
+ if (!DFTSInfo->TemplateArgumentsAsWritten)
+ AbbrevToUse = Writer.getDeclCXXMethodAbbrev(D->getTemplatedKind());
+ }
+ }
Code = serialization::DECL_CXX_METHOD;
}
@@ -1782,7 +1803,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
Record.push_back(D->wasDeclaredWithTypename());
const TypeConstraint *TC = D->getTypeConstraint();
- Record.push_back(TC != nullptr);
+ assert((bool)TC == D->hasTypeConstraint());
if (TC) {
auto *CR = TC->getConceptReference();
Record.push_back(CR != nullptr);
@@ -1800,6 +1821,13 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
if (OwnsDefaultArg)
Record.AddTypeSourceInfo(D->getDefaultArgumentInfo());
+ if (!TC && !OwnsDefaultArg &&
+ D->getDeclContext() == D->getLexicalDeclContext() &&
+ !D->isInvalidDecl() && !D->hasAttrs() &&
+ !D->isTopLevelDeclInObjCContainer() && !D->isImplicit() &&
+ D->getDeclName().getNameKind() == DeclarationName::Identifier)
+ AbbrevToUse = Writer.getDeclTemplateTypeParmAbbrev();
+
Code = serialization::DECL_TEMPLATE_TYPE_PARM;
}
@@ -2031,6 +2059,106 @@ void ASTDeclWriter::VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D) {
// ASTWriter Implementation
//===----------------------------------------------------------------------===//
+namespace {
+template <FunctionDecl::TemplatedKind Kind>
+std::shared_ptr<llvm::BitCodeAbbrev>
+getFunctionDeclAbbrev(serialization::DeclCode Code) {
+ using namespace llvm;
+
+ auto Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(Code));
+ // RedeclarableDecl
+ Abv->Add(BitCodeAbbrevOp(0)); // CanonicalDecl
+ Abv->Add(BitCodeAbbrevOp(Kind));
+ if constexpr (Kind == FunctionDecl::TK_NonTemplate) {
+
+ } else if constexpr (Kind == FunctionDecl::TK_FunctionTemplate) {
+ // DescribedFunctionTemplate
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ } else if constexpr (Kind == FunctionDecl::TK_DependentNonTemplate) {
+ // Instantiated From Decl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ } else if constexpr (Kind == FunctionDecl::TK_MemberSpecialization) {
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InstantiatedFrom
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ 3)); // TemplateSpecializationKind
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Specialized Location
+ } else if constexpr (Kind ==
+ FunctionDecl::TK_FunctionTemplateSpecialization) {
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Template
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ 3)); // TemplateSpecializationKind
+ Abv->Add(BitCodeAbbrevOp(1)); // Template Argument Size
+ Abv->Add(BitCodeAbbrevOp(TemplateArgument::Type)); // Template Argument Kind
+ Abv->Add(
+ BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Template Argument Type
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Is Defaulted
+ Abv->Add(BitCodeAbbrevOp(0)); // TemplateArgumentsAsWritten
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SourceLocation
+ Abv->Add(BitCodeAbbrevOp(0));
+ Abv->Add(
+ BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Canonical Decl of template
+ } else if constexpr (Kind == FunctionDecl::
+ TK_DependentFunctionTemplateSpecialization) {
+ // Candidates of specialization
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abv->Add(BitCodeAbbrevOp(0)); // TemplateArgumentsAsWritten
+ } else {
+ llvm_unreachable("Unknown templated kind?");
+ }
+ // Decl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ 8)); // Packed DeclBits: ModuleOwnershipKind,
+ // isUsed, isReferenced, AccessSpecifier,
+ // isImplicit
+ //
+ // The following bits should be 0:
+ // HasStandaloneLexicalDC, HasAttrs,
+ // TopLevelDeclInObjCContainer,
+ // isInvalidDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
+ // NamedDecl
+ Abv->Add(BitCodeAbbrevOp(DeclarationName::Identifier)); // NameKind
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Identifier
+ Abv->Add(BitCodeAbbrevOp(0)); // AnonDeclNumber
+ // ValueDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // DeclaratorDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InnerLocStart
+ Abv->Add(BitCodeAbbrevOp(0)); // HasExtInfo
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TSIType
+ // FunctionDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 11)); // IDNS
+ Abv->Add(BitCodeAbbrevOp(
+ BitCodeAbbrevOp::Fixed,
+ 27)); // Packed Function Bits: StorageClass, Inline, InlineSpecified,
+ // VirtualAsWritten, Pure, HasInheritedProto, HasWrittenProto,
+ // Deleted, Trivial, TrivialForCall, Defaulted, ExplicitlyDefaulted,
+ // IsIneligibleOrNotSelected, ImplicitReturnZero, Constexpr,
+ // UsesSEHTry, SkippedBody, MultiVersion, LateParsed,
+ // FriendConstraintRefersToEnclosingTemplate, Linkage
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LocEnd
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // ODRHash
+ // This Array slurps the rest of the record. Fortunately we want to encode
+ // (nearly) all the remaining (variable number of) fields in the same way.
+ //
+ // This is:
+ // NumParams and Params[] from FunctionDecl, and
+ // NumOverriddenMethods, OverriddenMethods[] from CXXMethodDecl.
+ //
+ // Add an AbbrevOp for 'size then elements' and use it here.
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
+ return Abv;
+}
+
+template <FunctionDecl::TemplatedKind Kind>
+std::shared_ptr<llvm::BitCodeAbbrev> getCXXMethodAbbrev() {
+ return getFunctionDeclAbbrev<Kind>(serialization::DECL_CXX_METHOD);
+}
+} // namespace
+
void ASTWriter::WriteDeclAbbrevs() {
using namespace llvm;
@@ -2041,10 +2169,13 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(serialization::DECL_FIELD));
// Decl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
- 12)); // Packed DeclBits: HasStandaloneLexicalDC,
- // isInvalidDecl, HasAttrs, isImplicit, isUsed,
- // isReferenced, TopLevelDeclInObjCContainer,
- // AccessSpecifier, ModuleOwnershipKind
+ 7)); // Packed DeclBits: ModuleOwnershipKind,
+ // isUsed, isReferenced, AccessSpecifier,
+ //
+ // The following bits should be 0:
+ // isImplicit, HasStandaloneLexicalDC, HasAttrs,
+ // TopLevelDeclInObjCContainer,
+ // isInvalidDecl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
// NamedDecl
@@ -2104,10 +2235,13 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
// Decl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
- 12)); // Packed DeclBits: HasStandaloneLexicalDC,
- // isInvalidDecl, HasAttrs, isImplicit, isUsed,
- // isReferenced, TopLevelDeclInObjCContainer,
- // AccessSpecifier, ModuleOwnershipKind
+ 7)); // Packed DeclBits: ModuleOwnershipKind,
+ // isUsed, isReferenced, AccessSpecifier,
+ //
+ // The following bits should be 0:
+ // isImplicit, HasStandaloneLexicalDC, HasAttrs,
+ // TopLevelDeclInObjCContainer,
+ // isInvalidDecl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
// NamedDecl
@@ -2145,10 +2279,13 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
// Decl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
- 12)); // Packed DeclBits: HasStandaloneLexicalDC,
- // isInvalidDecl, HasAttrs, isImplicit, isUsed,
- // isReferenced, TopLevelDeclInObjCContainer,
- // AccessSpecifier, ModuleOwnershipKind
+ 7)); // Packed DeclBits: ModuleOwnershipKind,
+ // isUsed, isReferenced, AccessSpecifier,
+ //
+ // The following bits should be 0:
+ // isImplicit, HasStandaloneLexicalDC, HasAttrs,
+ // TopLevelDeclInObjCContainer,
+ // isInvalidDecl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
// NamedDecl
@@ -2193,10 +2330,11 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
// Decl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
- 12)); // Packed DeclBits: HasStandaloneLexicalDC,
- // isInvalidDecl, HasAttrs, isImplicit, isUsed,
- // isReferenced, TopLevelDeclInObjCContainer,
- // AccessSpecifier, ModuleOwnershipKind
+ 8)); // Packed DeclBits: ModuleOwnershipKind, isUsed,
+ // isReferenced, AccessSpecifier,
+ // HasStandaloneLexicalDC, HasAttrs, isImplicit,
+ // TopLevelDeclInObjCContainer,
+ // isInvalidDecl,
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
// NamedDecl
@@ -2216,10 +2354,11 @@ void ASTWriter::WriteDeclAbbrevs() {
// isARCPseudoStrong, Linkage, ModulesCodegen
Abv->Add(BitCodeAbbrevOp(0)); // VarKind (local enum)
// ParmVarDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ScopeIndex
Abv->Add(BitCodeAbbrevOp(
BitCodeAbbrevOp::Fixed,
- 27)); // Packed Parm Var Decl bits: IsObjCMethodParameter, ScopeDepth,
- // ScopeIndex, ObjCDeclQualifier, KNRPromoted,
+ 19)); // Packed Parm Var Decl bits: IsObjCMethodParameter, ScopeDepth,
+ // ObjCDeclQualifier, KNRPromoted,
// HasInheritedDefaultArg, HasUninstantiatedDefaultArg
// Type Source Info
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
@@ -2233,10 +2372,11 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
// Decl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
- 12)); // Packed DeclBits: HasStandaloneLexicalDC,
- // isInvalidDecl, HasAttrs, isImplicit, isUsed,
- // isReferenced, TopLevelDeclInObjCContainer,
- // AccessSpecifier, ModuleOwnershipKind
+ 7)); // Packed DeclBits: ModuleOwnershipKind,
+ // isReferenced, isUsed, AccessSpecifier. Other
+ // higher bits should be 0: isImplicit,
+ // HasStandaloneLexicalDC, HasAttrs,
+ // TopLevelDeclInObjCContainer, isInvalidDecl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
// NamedDecl
@@ -2277,12 +2417,13 @@ void ASTWriter::WriteDeclAbbrevs() {
// VarDecl
Abv->Add(BitCodeAbbrevOp(
BitCodeAbbrevOp::Fixed,
- 27)); // Packed Var Decl bits: SClass, TSCSpec, InitStyle,
+ 21)); // Packed Var Decl bits: Linkage, ModulesCodegen,
+ // SClass, TSCSpec, InitStyle,
// isARCPseudoStrong, IsThisDeclarationADemotedDefinition,
// isExceptionVariable, isNRVOVariable, isCXXForRangeDecl,
- // isObjCForDecl, isInline, isInlineSpecified, isConstexpr,
- // isInitCapture, isPrevDeclInSameScope, ImplicitParamKind,
- // EscapingByref, HasDeducedType, Linkage, ModulesCodegen
+ // isInline, isInlineSpecified, isConstexpr,
+ // isInitCapture, isPrevDeclInSameScope,
+ // EscapingByref, HasDeducedType, ImplicitParamKind, isObjCForDecl
Abv->Add(BitCodeAbbrevOp(0)); // VarKind (local enum)
// Type Source Info
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
@@ -2290,71 +2431,84 @@ void ASTWriter::WriteDeclAbbrevs() {
DeclVarAbbrev = Stream.EmitAbbrev(std::move(Abv));
// Abbreviation for DECL_CXX_METHOD
+ DeclCXXMethodAbbrev =
+ Stream.EmitAbbrev(getCXXMethodAbbrev<FunctionDecl::TK_NonTemplate>());
+ DeclTemplateCXXMethodAbbrev = Stream.EmitAbbrev(
+ getCXXMethodAbbrev<FunctionDecl::TK_FunctionTemplate>());
+ DeclDependentNonTemplateCXXMethodAbbrev = Stream.EmitAbbrev(
+ getCXXMethodAbbrev<FunctionDecl::TK_DependentNonTemplate>());
+ DeclMemberSpecializedCXXMethodAbbrev = Stream.EmitAbbrev(
+ getCXXMethodAbbrev<FunctionDecl::TK_MemberSpecialization>());
+ DeclTemplateSpecializedCXXMethodAbbrev = Stream.EmitAbbrev(
+ getCXXMethodAbbrev<FunctionDecl::TK_FunctionTemplateSpecialization>());
+ DeclDependentSpecializationCXXMethodAbbrev = Stream.EmitAbbrev(
+ getCXXMethodAbbrev<
+ FunctionDecl::TK_DependentFunctionTemplateSpecialization>());
+
+ // Abbreviation for DECL_TEMPLATE_TYPE_PARM
Abv = std::make_shared<BitCodeAbbrev>();
- Abv->Add(BitCodeAbbrevOp(serialization::DECL_CXX_METHOD));
- // RedeclarableDecl
- Abv->Add(BitCodeAbbrevOp(0)); // CanonicalDecl
- // FIXME: Implement abbreviation for other template kinds.
- Abv->Add(BitCodeAbbrevOp(FunctionDecl::TK_NonTemplate)); // TemplateKind
+ Abv->Add(BitCodeAbbrevOp(serialization::DECL_TEMPLATE_TYPE_PARM));
+ Abv->Add(BitCodeAbbrevOp(0)); // hasTypeConstraint
+ // Decl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+ 7)); // Packed DeclBits: ModuleOwnershipKind,
+ // isReferenced, isUsed, AccessSpecifier. Other
+ // higher bits should be 0: isImplicit,
+ // HasStandaloneLexicalDC, HasAttrs,
+ // TopLevelDeclInObjCContainer, isInvalidDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
+ // NamedDecl
+ Abv->Add(BitCodeAbbrevOp(0)); // NameKind = Identifier
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Name
+ Abv->Add(BitCodeAbbrevOp(0));
+ // TypeDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type Ref
+ // TemplateTypeParmDecl
+ Abv->Add(
+ BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // wasDeclaredWithTypename
+ Abv->Add(BitCodeAbbrevOp(0)); // OwnsDefaultArg
+ DeclTemplateTypeParmAbbrev = Stream.EmitAbbrev(std::move(Abv));
+
+ // Abbreviation for DECL_USING_SHADOW
+ Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(serialization::DECL_USING_SHADOW));
+ // Redeclarable
+ Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration
// Decl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
12)); // Packed DeclBits: HasStandaloneLexicalDC,
// isInvalidDecl, HasAttrs, isImplicit, isUsed,
// isReferenced, TopLevelDeclInObjCContainer,
// AccessSpecifier, ModuleOwnershipKind
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID
// NamedDecl
- Abv->Add(BitCodeAbbrevOp(DeclarationName::Identifier)); // NameKind
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Identifier
- Abv->Add(BitCodeAbbrevOp(0)); // AnonDeclNumber
- // ValueDecl
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
- // DeclaratorDecl
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InnerLocStart
- Abv->Add(BitCodeAbbrevOp(0)); // HasExtInfo
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TSIType
- // FunctionDecl
+ Abv->Add(BitCodeAbbrevOp(0)); // NameKind = Identifier
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Name
+ Abv->Add(BitCodeAbbrevOp(0));
+ // UsingShadowDecl
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TargetDecl
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 11)); // IDNS
- Abv->Add(BitCodeAbbrevOp(
- BitCodeAbbrevOp::Fixed,
- 27)); // Packed Function Bits: StorageClass, Inline, InlineSpecified,
- // VirtualAsWritten, Pure, HasInheritedProto, HasWrittenProto,
- // Deleted, Trivial, TrivialForCall, Defaulted, ExplicitlyDefaulted,
- // IsIneligibleOrNotSelected, ImplicitReturnZero, Constexpr,
- // UsesSEHTry, SkippedBody, MultiVersion, LateParsed,
- // FriendConstraintRefersToEnclosingTemplate, Linkage
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LocEnd
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Default
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // ODRHash
- // This Array slurps the rest of the record. Fortunately we want to encode
- // (nearly) all the remaining (variable number of) fields in the same way.
- //
- // This is:
- // NumParams and Params[] from FunctionDecl, and
- // NumOverriddenMethods, OverriddenMethods[] from CXXMethodDecl.
- //
- // Add an AbbrevOp for 'size then elements' and use it here.
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
- DeclCXXMethodAbbrev = Stream.EmitAbbrev(std::move(Abv));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // UsingOrNextShadow
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR,
+ 6)); // InstantiatedFromUsingShadowDecl
+ DeclUsingShadowAbbrev = Stream.EmitAbbrev(std::move(Abv));
// Abbreviation for EXPR_DECL_REF
Abv = std::make_shared<BitCodeAbbrev>();
Abv->Add(BitCodeAbbrevOp(serialization::EXPR_DECL_REF));
- //Stmt
- // Expr
+ // Stmt
+ // Expr
+ // PackingBits: DependenceKind, ValueKind. ObjectKind should be 0.
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
- // DependenceKind, ValueKind, ObjectKind
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10));
- //DeclRefExpr
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //HasQualifier
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //GetDeclFound
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ExplicitTemplateArgs
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //HadMultipleCandidates
- Abv->Add(BitCodeAbbrevOp(0)); // RefersToEnclosingVariableOrCapture
- Abv->Add(BitCodeAbbrevOp(0)); // NonOdrUseReason
- Abv->Add(BitCodeAbbrevOp(0)); // IsImmediateEscalating
+ // DeclRefExpr
+ // Packing Bits: HadMultipleCandidates, RefersToEnclosingVariableOrCapture,
+ // IsImmediateEscalating, NonOdrUseReason.
+ // GetDeclFound, HasQualifier and ExplicitTemplateArgs should be 0.
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5));
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclRef
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location
DeclRefExprAbbrev = Stream.EmitAbbrev(std::move(Abv));
@@ -2364,10 +2518,10 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(serialization::EXPR_INTEGER_LITERAL));
//Stmt
// Expr
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
// DependenceKind, ValueKind, ObjectKind
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10));
- //Integer Literal
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // Integer Literal
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location
Abv->Add(BitCodeAbbrevOp(32)); // Bit Width
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Value
@@ -2378,10 +2532,10 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CHARACTER_LITERAL));
//Stmt
// Expr
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
// DependenceKind, ValueKind, ObjectKind
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10));
- //Character Literal
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // Character Literal
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // getValue
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // getKind
@@ -2392,17 +2546,108 @@ void ASTWriter::WriteDeclAbbrevs() {
Abv->Add(BitCodeAbbrevOp(serialization::EXPR_IMPLICIT_CAST));
// Stmt
// Expr
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
- // DependenceKind, ValueKind, ObjectKind
+ // Packing Bits: DependenceKind, ValueKind, ObjectKind,
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
// CastExpr
Abv->Add(BitCodeAbbrevOp(0)); // PathSize
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // HasFPFeatures
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 6)); // CastKind
- Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // PartOfExplicitCast
+ // Packing Bits: CastKind, StoredFPFeatures, isPartOfExplicitCast
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 9));
// ImplicitCastExpr
ExprImplicitCastAbbrev = Stream.EmitAbbrev(std::move(Abv));
+ // Abbreviation for EXPR_BINARY_OPERATOR
+ Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(serialization::EXPR_BINARY_OPERATOR));
+ // Stmt
+ // Expr
+ // Packing Bits: DependenceKind. ValueKind and ObjectKind should
+ // be 0 in this case.
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // BinaryOperator
+ Abv->Add(
+ BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpCode and HasFPFeatures
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ BinaryOperatorAbbrev = Stream.EmitAbbrev(std::move(Abv));
+
+ // Abbreviation for EXPR_COMPOUND_ASSIGN_OPERATOR
+ Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(serialization::EXPR_COMPOUND_ASSIGN_OPERATOR));
+ // Stmt
+ // Expr
+ // Packing Bits: DependenceKind. ValueKind and ObjectKind should
+ // be 0 in this case.
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // BinaryOperator
+ // Packing Bits: OpCode. The HasFPFeatures bit should be 0
+ Abv->Add(
+ BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpCode and HasFPFeatures
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ // CompoundAssignOperator
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHSType
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Result Type
+ CompoundAssignOperatorAbbrev = Stream.EmitAbbrev(std::move(Abv));
+
+ // Abbreviation for EXPR_CALL
+ Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CALL));
+ // Stmt
+ // Expr
+ // Packing Bits: DependenceKind, ValueKind, ObjectKind,
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // CallExpr
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // NumArgs
+ Abv->Add(BitCodeAbbrevOp(0)); // ADLCallKind
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ CallExprAbbrev = Stream.EmitAbbrev(std::move(Abv));
+
+ // Abbreviation for EXPR_CXX_OPERATOR_CALL
+ Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CXX_OPERATOR_CALL));
+ // Stmt
+ // Expr
+ // Packing Bits: DependenceKind, ValueKind, ObjectKind,
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // CallExpr
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // NumArgs
+ Abv->Add(BitCodeAbbrevOp(0)); // ADLCallKind
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ // CXXOperatorCallExpr
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Operator Kind
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ CXXOperatorCallExprAbbrev = Stream.EmitAbbrev(std::move(Abv));
+
+ // Abbreviation for EXPR_CXX_MEMBER_CALL
+ Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CXX_MEMBER_CALL));
+ // Stmt
+ // Expr
+ // Packing Bits: DependenceKind, ValueKind, ObjectKind,
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10));
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type
+ // CallExpr
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // NumArgs
+ Abv->Add(BitCodeAbbrevOp(0)); // ADLCallKind
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ // CXXMemberCallExpr
+ CXXMemberCallExprAbbrev = Stream.EmitAbbrev(std::move(Abv));
+
+ // Abbreviation for STMT_COMPOUND
+ Abv = std::make_shared<BitCodeAbbrev>();
+ Abv->Add(BitCodeAbbrevOp(serialization::STMT_COMPOUND));
+ // Stmt
+ // CompoundStmt
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Num Stmts
+ Abv->Add(BitCodeAbbrevOp(0)); // hasStoredFPFeatures
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location
+ CompoundStmtAbbrev = Stream.EmitAbbrev(std::move(Abv));
+
Abv = std::make_shared<BitCodeAbbrev>();
Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_LEXICAL));
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 8524484ea8a0..7f888e44dde1 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -37,15 +37,70 @@ namespace clang {
serialization::StmtCode Code;
unsigned AbbrevToUse;
+ /// A helper that allows us to write packed bits across function
+ /// calls. For example, we may write separate bits in separate functions:
+ ///
+ /// void VisitA(A* a) {
+ /// Record.push_back(a->isSomething());
+ /// }
+ ///
+ /// void VisitB(B *b) {
+ /// VisitA(b);
+ /// Record.push_back(b->isAnother());
+ /// }
+ ///
+ /// In such cases, it is better if we can pack these 2 bits. We achieve this
+ /// by pushing a zero placeholder in `VisitA`, recording its index, and then
+ /// adding the new bits to the value that is written back into that slot.
+ class PakedBitsWriter {
+ public:
+ PakedBitsWriter(ASTRecordWriter &Record) : RecordRef(Record) {}
+ ~PakedBitsWriter() { assert(!CurrentIndex); }
+
+ void addBit(bool Value) {
+ assert(CurrentIndex && "Writing Bits without recording first!");
+ PackingBits.addBit(Value);
+ }
+ void addBits(uint32_t Value, uint32_t BitsWidth) {
+ assert(CurrentIndex && "Writing Bits without recording first!");
+ PackingBits.addBits(Value, BitsWidth);
+ }
+
+ void writeBits() {
+ if (!CurrentIndex)
+ return;
+
+ RecordRef[*CurrentIndex] = (uint32_t)PackingBits;
+ CurrentIndex = std::nullopt;
+ PackingBits.reset(0);
+ }
+
+ void updateBits() {
+ writeBits();
+
+ CurrentIndex = RecordRef.size();
+ RecordRef.push_back(0);
+ }
+
+ private:
+ BitsPacker PackingBits;
+ ASTRecordWriter &RecordRef;
+ std::optional<unsigned> CurrentIndex;
+ };
+
+ PakedBitsWriter CurrentPackingBits;
+
public:
ASTStmtWriter(ASTWriter &Writer, ASTWriter::RecordData &Record)
: Writer(Writer), Record(Writer, Record),
- Code(serialization::STMT_NULL_PTR), AbbrevToUse(0) {}
+ Code(serialization::STMT_NULL_PTR), AbbrevToUse(0),
+ CurrentPackingBits(this->Record) {}
ASTStmtWriter(const ASTStmtWriter&) = delete;
ASTStmtWriter &operator=(const ASTStmtWriter &) = delete;
uint64_t Emit() {
+ CurrentPackingBits.writeBits();
assert(Code != serialization::STMT_NULL_PTR &&
"unhandled sub-statement writing AST file");
return Record.EmitStmt(Code, AbbrevToUse);
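
As a rough usage sketch of the lifecycle described in the comment above, here is a toy rewrite of the same idea against a plain std::vector instead of the ASTRecordWriter-backed class (ToyPackedBitsWriter is illustrative only):

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

// Simplified illustration of the packed-bits-writer lifecycle: updateBits()
// reserves a slot, addBit() accumulates into it, and writeBits() (called from
// Emit()) patches the reserved slot with the packed value.
class ToyPackedBitsWriter {
public:
  explicit ToyPackedBitsWriter(std::vector<uint64_t> &Record)
      : Record(Record) {}

  void updateBits() {
    writeBits();
    CurrentIndex = Record.size();
    Record.push_back(0); // Placeholder patched later.
  }
  void addBit(bool Value) {
    assert(CurrentIndex && "Writing bits without recording first!");
    Packed |= (uint64_t)Value << NumBits++;
  }
  void writeBits() {
    if (!CurrentIndex)
      return;
    Record[*CurrentIndex] = Packed;
    CurrentIndex.reset();
    Packed = 0;
    NumBits = 0;
  }

private:
  std::vector<uint64_t> &Record;
  std::optional<unsigned> CurrentIndex;
  uint64_t Packed = 0;
  unsigned NumBits = 0;
};

int main() {
  std::vector<uint64_t> Record;
  ToyPackedBitsWriter Bits(Record);
  Bits.updateBits();  // e.g. at the start of VisitIfStmt.
  Bits.addBit(true);  // HasElse
  Bits.addBit(false); // HasVar
  Bits.addBit(true);  // HasInit
  Bits.writeBits();   // e.g. from Emit().
  assert(Record.size() == 1 && Record[0] == 0b101);
  return 0;
}
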
@@ -82,14 +137,20 @@ void ASTStmtWriter::VisitNullStmt(NullStmt *S) {
void ASTStmtWriter::VisitCompoundStmt(CompoundStmt *S) {
VisitStmt(S);
+
Record.push_back(S->size());
Record.push_back(S->hasStoredFPFeatures());
+
for (auto *CS : S->body())
Record.AddStmt(CS);
if (S->hasStoredFPFeatures())
Record.push_back(S->getStoredFPFeatures().getAsOpaqueInt());
Record.AddSourceLocation(S->getLBracLoc());
Record.AddSourceLocation(S->getRBracLoc());
+
+ if (!S->hasStoredFPFeatures())
+ AbbrevToUse = Writer.getCompoundStmtAbbrev();
+
Code = serialization::STMT_COMPOUND;
}
@@ -143,9 +204,11 @@ void ASTStmtWriter::VisitIfStmt(IfStmt *S) {
bool HasVar = S->getConditionVariableDeclStmt() != nullptr;
bool HasInit = S->getInit() != nullptr;
- Record.push_back(HasElse);
- Record.push_back(HasVar);
- Record.push_back(HasInit);
+ CurrentPackingBits.updateBits();
+
+ CurrentPackingBits.addBit(HasElse);
+ CurrentPackingBits.addBit(HasVar);
+ CurrentPackingBits.addBit(HasInit);
Record.push_back(static_cast<uint64_t>(S->getStatementKind()));
Record.AddStmt(S->getCond());
Record.AddStmt(S->getThen());
@@ -548,15 +611,13 @@ void ASTStmtWriter::VisitCapturedStmt(CapturedStmt *S) {
void ASTStmtWriter::VisitExpr(Expr *E) {
VisitStmt(E);
- Record.AddTypeRef(E->getType());
- BitsPacker ExprBits;
+ CurrentPackingBits.updateBits();
+ CurrentPackingBits.addBits(E->getDependence(), /*BitsWidth=*/5);
+ CurrentPackingBits.addBits(E->getValueKind(), /*BitsWidth=*/2);
+ CurrentPackingBits.addBits(E->getObjectKind(), /*BitsWidth=*/3);
- ExprBits.addBits(E->getDependence(), /*BitsWidth=*/5);
- ExprBits.addBits(E->getValueKind(), /*BitsWidth=*/2);
- ExprBits.addBits(E->getObjectKind(), /*BitsWidth=*/3);
-
- Record.push_back(ExprBits);
+ Record.AddTypeRef(E->getType());
}
void ASTStmtWriter::VisitConstantExpr(ConstantExpr *E) {
@@ -612,13 +673,15 @@ void ASTStmtWriter::VisitPredefinedExpr(PredefinedExpr *E) {
void ASTStmtWriter::VisitDeclRefExpr(DeclRefExpr *E) {
VisitExpr(E);
- Record.push_back(E->hasQualifier());
- Record.push_back(E->getDecl() != E->getFoundDecl());
- Record.push_back(E->hasTemplateKWAndArgsInfo());
- Record.push_back(E->hadMultipleCandidates());
- Record.push_back(E->refersToEnclosingVariableOrCapture());
- Record.push_back(E->isNonOdrUse());
- Record.push_back(E->isImmediateEscalating());
+ CurrentPackingBits.updateBits();
+
+ CurrentPackingBits.addBit(E->hadMultipleCandidates());
+ CurrentPackingBits.addBit(E->refersToEnclosingVariableOrCapture());
+ CurrentPackingBits.addBits(E->isNonOdrUse(), /*Width=*/2);
+ CurrentPackingBits.addBit(E->isImmediateEscalating());
+ CurrentPackingBits.addBit(E->getDecl() != E->getFoundDecl());
+ CurrentPackingBits.addBit(E->hasQualifier());
+ CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo());
if (E->hasTemplateKWAndArgsInfo()) {
unsigned NumTemplateArgs = E->getNumTemplateArgs();
@@ -629,9 +692,7 @@ void ASTStmtWriter::VisitDeclRefExpr(DeclRefExpr *E) {
if ((!E->hasTemplateKWAndArgsInfo()) && (!E->hasQualifier()) &&
(E->getDecl() == E->getFoundDecl()) &&
- nk == DeclarationName::Identifier &&
- !E->refersToEnclosingVariableOrCapture() && !E->isNonOdrUse() &&
- !E->isImmediateEscalating()) {
+ nk == DeclarationName::Identifier && E->getObjectKind() == OK_Ordinary) {
AbbrevToUse = Writer.getDeclRefExprAbbrev();
}
@@ -742,11 +803,13 @@ void ASTStmtWriter::VisitUnaryOperator(UnaryOperator *E) {
bool HasFPFeatures = E->hasStoredFPFeatures();
// Write this first for easy access when deserializing, as they affect the
// size of the UnaryOperator.
- Record.push_back(HasFPFeatures);
+ CurrentPackingBits.addBit(HasFPFeatures);
Record.AddStmt(E->getSubExpr());
- Record.push_back(E->getOpcode()); // FIXME: stable encoding
+ CurrentPackingBits.addBits(E->getOpcode(),
+ /*Width=*/5); // FIXME: stable encoding
Record.AddSourceLocation(E->getOperatorLoc());
- Record.push_back(E->canOverflow());
+ CurrentPackingBits.addBit(E->canOverflow());
+
if (HasFPFeatures)
Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt());
Code = serialization::EXPR_UNARY_OPERATOR;
@@ -872,12 +935,10 @@ void ASTStmtWriter::VisitOMPIteratorExpr(OMPIteratorExpr *E) {
void ASTStmtWriter::VisitCallExpr(CallExpr *E) {
VisitExpr(E);
- BitsPacker CallExprBits;
- // 16 bits should be sufficient to store the number args;
- CallExprBits.addBits(E->getNumArgs(), /*BitsWidth=*/16);
- CallExprBits.addBit(E->hasStoredFPFeatures());
- CallExprBits.addBit(static_cast<bool>(E->getADLCallKind()));
- Record.push_back(CallExprBits);
+ Record.push_back(E->getNumArgs());
+ CurrentPackingBits.updateBits();
+ CurrentPackingBits.addBit(static_cast<bool>(E->getADLCallKind()));
+ CurrentPackingBits.addBit(E->hasStoredFPFeatures());
Record.AddSourceLocation(E->getRParenLoc());
Record.AddStmt(E->getCallee());
@@ -887,6 +948,11 @@ void ASTStmtWriter::VisitCallExpr(CallExpr *E) {
if (E->hasStoredFPFeatures())
Record.push_back(E->getFPFeatures().getAsOpaqueInt());
+
+ if (!E->hasStoredFPFeatures() && !static_cast<bool>(E->getADLCallKind()) &&
+ E->getStmtClass() == Stmt::CallExprClass)
+ AbbrevToUse = Writer.getCallExprAbbrev();
+
Code = serialization::EXPR_CALL;
}
@@ -913,9 +979,10 @@ void ASTStmtWriter::VisitMemberExpr(MemberExpr *E) {
// Write these first for easy access when deserializing, as they affect the
// size of the MemberExpr.
- Record.push_back(HasQualifier);
- Record.push_back(HasFoundDecl);
- Record.push_back(HasTemplateInfo);
+ CurrentPackingBits.updateBits();
+ CurrentPackingBits.addBit(HasQualifier);
+ CurrentPackingBits.addBit(HasFoundDecl);
+ CurrentPackingBits.addBit(HasTemplateInfo);
Record.push_back(NumTemplateArgs);
Record.AddStmt(E->getBase());
@@ -923,15 +990,15 @@ void ASTStmtWriter::VisitMemberExpr(MemberExpr *E) {
Record.AddDeclarationNameLoc(E->MemberDNLoc,
E->getMemberDecl()->getDeclName());
Record.AddSourceLocation(E->getMemberLoc());
- Record.push_back(E->isArrow());
- Record.push_back(E->hadMultipleCandidates());
- Record.push_back(E->isNonOdrUse());
+ CurrentPackingBits.addBit(E->isArrow());
+ CurrentPackingBits.addBit(E->hadMultipleCandidates());
+ CurrentPackingBits.addBits(E->isNonOdrUse(), /*Width=*/2);
Record.AddSourceLocation(E->getOperatorLoc());
if (HasFoundDecl) {
DeclAccessPair FoundDecl = E->getFoundDecl();
Record.AddDeclRef(FoundDecl.getDecl());
- Record.push_back(FoundDecl.getAccess());
+ CurrentPackingBits.addBits(FoundDecl.getAccess(), /*BitWidth=*/2);
}
if (HasQualifier)
@@ -971,10 +1038,13 @@ void ASTStmtWriter::VisitObjCBridgedCastExpr(ObjCBridgedCastExpr *E) {
void ASTStmtWriter::VisitCastExpr(CastExpr *E) {
VisitExpr(E);
+
Record.push_back(E->path_size());
- Record.push_back(E->hasStoredFPFeatures());
+ CurrentPackingBits.updateBits();
+ // 7 bits should be enough to store the casting kinds.
+ CurrentPackingBits.addBits(E->getCastKind(), /*Width=*/7);
+ CurrentPackingBits.addBit(E->hasStoredFPFeatures());
Record.AddStmt(E->getSubExpr());
- Record.push_back(E->getCastKind()); // FIXME: stable encoding
for (CastExpr::path_iterator
PI = E->path_begin(), PE = E->path_end(); PI != PE; ++PI)
@@ -986,16 +1056,23 @@ void ASTStmtWriter::VisitCastExpr(CastExpr *E) {
void ASTStmtWriter::VisitBinaryOperator(BinaryOperator *E) {
VisitExpr(E);
- bool HasFPFeatures = E->hasStoredFPFeatures();
+
// Write this first for easy access when deserializing, as they affect the
// size of the UnaryOperator.
- Record.push_back(HasFPFeatures);
- Record.push_back(E->getOpcode()); // FIXME: stable encoding
+ CurrentPackingBits.updateBits();
+ CurrentPackingBits.addBits(E->getOpcode(), /*Width=*/6);
+ bool HasFPFeatures = E->hasStoredFPFeatures();
+ CurrentPackingBits.addBit(HasFPFeatures);
Record.AddStmt(E->getLHS());
Record.AddStmt(E->getRHS());
Record.AddSourceLocation(E->getOperatorLoc());
if (HasFPFeatures)
Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt());
+
+ if (!HasFPFeatures && E->getValueKind() == VK_PRValue &&
+ E->getObjectKind() == OK_Ordinary)
+ AbbrevToUse = Writer.getBinaryOperatorAbbrev();
+
Code = serialization::EXPR_BINARY_OPERATOR;
}
@@ -1003,6 +1080,11 @@ void ASTStmtWriter::VisitCompoundAssignOperator(CompoundAssignOperator *E) {
VisitBinaryOperator(E);
Record.AddTypeRef(E->getComputationLHSType());
Record.AddTypeRef(E->getComputationResultType());
+
+ if (!E->hasStoredFPFeatures() && E->getValueKind() == VK_PRValue &&
+ E->getObjectKind() == OK_Ordinary)
+ AbbrevToUse = Writer.getCompoundAssignOperatorAbbrev();
+
Code = serialization::EXPR_COMPOUND_ASSIGN_OPERATOR;
}
@@ -1031,7 +1113,7 @@ ASTStmtWriter::VisitBinaryConditionalOperator(BinaryConditionalOperator *E) {
void ASTStmtWriter::VisitImplicitCastExpr(ImplicitCastExpr *E) {
VisitCastExpr(E);
- Record.push_back(E->isPartOfExplicitCast());
+ CurrentPackingBits.addBit(E->isPartOfExplicitCast());
if (E->path_size() == 0 && !E->hasStoredFPFeatures())
AbbrevToUse = Writer.getExprImplicitCastAbbrev();
@@ -1588,11 +1670,19 @@ void ASTStmtWriter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) {
VisitCallExpr(E);
Record.push_back(E->getOperator());
Record.AddSourceRange(E->Range);
+
+ if (!E->hasStoredFPFeatures() && !static_cast<bool>(E->getADLCallKind()))
+ AbbrevToUse = Writer.getCXXOperatorCallExprAbbrev();
+
Code = serialization::EXPR_CXX_OPERATOR_CALL;
}
void ASTStmtWriter::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) {
VisitCallExpr(E);
+
+ if (!E->hasStoredFPFeatures() && !static_cast<bool>(E->getADLCallKind()))
+ AbbrevToUse = Writer.getCXXMemberCallExprAbbrev();
+
Code = serialization::EXPR_CXX_MEMBER_CALL;
}
@@ -1673,7 +1763,9 @@ void ASTStmtWriter::VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E)
void ASTStmtWriter::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) {
VisitExplicitCastExpr(E);
Record.AddSourceRange(SourceRange(E->getOperatorLoc(), E->getRParenLoc()));
- Record.AddSourceRange(E->getAngleBrackets());
+ CurrentPackingBits.addBit(E->getAngleBrackets().isValid());
+ if (E->getAngleBrackets().isValid())
+ Record.AddSourceRange(E->getAngleBrackets());
}
void ASTStmtWriter::VisitCXXStaticCastExpr(CXXStaticCastExpr *E) {
@@ -1750,6 +1842,7 @@ void ASTStmtWriter::VisitCXXThisExpr(CXXThisExpr *E) {
VisitExpr(E);
Record.AddSourceLocation(E->getLocation());
Record.push_back(E->isImplicit());
+
Code = serialization::EXPR_CXX_THIS;
}
@@ -1883,10 +1976,10 @@ void ASTStmtWriter::VisitCXXDependentScopeMemberExpr(
// Don't emit anything here (or if you do you will have to update
// the corresponding deserialization function).
-
- Record.push_back(E->hasTemplateKWAndArgsInfo());
Record.push_back(E->getNumTemplateArgs());
- Record.push_back(E->hasFirstQualifierFoundInScope());
+ CurrentPackingBits.updateBits();
+ CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo());
+ CurrentPackingBits.addBit(E->hasFirstQualifierFoundInScope());
if (E->hasTemplateKWAndArgsInfo()) {
const ASTTemplateKWAndArgsInfo &ArgInfo =
@@ -1895,14 +1988,15 @@ void ASTStmtWriter::VisitCXXDependentScopeMemberExpr(
E->getTrailingObjects<TemplateArgumentLoc>());
}
- Record.push_back(E->isArrow());
- Record.AddSourceLocation(E->getOperatorLoc());
+ CurrentPackingBits.addBit(E->isArrow());
+
Record.AddTypeRef(E->getBaseType());
Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
+ CurrentPackingBits.addBit(!E->isImplicitAccess());
if (!E->isImplicitAccess())
Record.AddStmt(E->getBase());
- else
- Record.AddStmt(nullptr);
+
+ Record.AddSourceLocation(E->getOperatorLoc());
if (E->hasFirstQualifierFoundInScope())
Record.AddDeclRef(E->getFirstQualifierFoundInScope());
@@ -1917,12 +2011,14 @@ ASTStmtWriter::VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) {
// Don't emit anything here, HasTemplateKWAndArgsInfo must be
// emitted first.
-  Record.push_back(E->DependentScopeDeclRefExprBits.HasTemplateKWAndArgsInfo);
+  CurrentPackingBits.addBit(
+      E->DependentScopeDeclRefExprBits.HasTemplateKWAndArgsInfo);
if (E->DependentScopeDeclRefExprBits.HasTemplateKWAndArgsInfo) {
const ASTTemplateKWAndArgsInfo &ArgInfo =
*E->getTrailingObjects<ASTTemplateKWAndArgsInfo>();
- Record.push_back(ArgInfo.NumTemplateArgs);
+    // 16 bits should be enough to store the number of args
+ CurrentPackingBits.addBits(ArgInfo.NumTemplateArgs, /*Width=*/16);
AddTemplateKWAndArgsInfo(ArgInfo,
E->getTrailingObjects<TemplateArgumentLoc>());
}
@@ -1949,19 +2045,16 @@ ASTStmtWriter::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) {
void ASTStmtWriter::VisitOverloadExpr(OverloadExpr *E) {
VisitExpr(E);
- BitsPacker OverloadExprBits;
- // 14 Bits should enough to store the number of decls.
- OverloadExprBits.addBits(E->getNumDecls(), /*BitWidth=*/14);
- OverloadExprBits.addBit(E->hasTemplateKWAndArgsInfo());
+ Record.push_back(E->getNumDecls());
+
+ CurrentPackingBits.updateBits();
+ CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo());
if (E->hasTemplateKWAndArgsInfo()) {
const ASTTemplateKWAndArgsInfo &ArgInfo =
*E->getTrailingASTTemplateKWAndArgsInfo();
- // 14 Bits should enough to store the number of template args.
- OverloadExprBits.addBits(ArgInfo.NumTemplateArgs, /*BitWidth=*/14);
- Record.push_back(OverloadExprBits);
+ Record.push_back(ArgInfo.NumTemplateArgs);
AddTemplateKWAndArgsInfo(ArgInfo, E->getTrailingTemplateArgumentLoc());
- } else
- Record.push_back(OverloadExprBits);
+ }
for (OverloadExpr::decls_iterator OvI = E->decls_begin(),
OvE = E->decls_end();
@@ -1976,18 +2069,22 @@ void ASTStmtWriter::VisitOverloadExpr(OverloadExpr *E) {
void ASTStmtWriter::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *E) {
VisitOverloadExpr(E);
- Record.push_back(E->isArrow());
- Record.push_back(E->hasUnresolvedUsing());
- Record.AddStmt(!E->isImplicitAccess() ? E->getBase() : nullptr);
- Record.AddTypeRef(E->getBaseType());
+ CurrentPackingBits.addBit(E->isArrow());
+ CurrentPackingBits.addBit(E->hasUnresolvedUsing());
+ CurrentPackingBits.addBit(!E->isImplicitAccess());
+ if (!E->isImplicitAccess())
+ Record.AddStmt(E->getBase());
+
Record.AddSourceLocation(E->getOperatorLoc());
+
+ Record.AddTypeRef(E->getBaseType());
Code = serialization::EXPR_CXX_UNRESOLVED_MEMBER;
}
void ASTStmtWriter::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E) {
VisitOverloadExpr(E);
- Record.push_back(E->requiresADL());
- Record.push_back(E->isOverloaded());
+ CurrentPackingBits.addBit(E->requiresADL());
+ CurrentPackingBits.addBit(E->isOverloaded());
Record.AddDeclRef(E->getNamingClass());
Code = serialization::EXPR_CXX_UNRESOLVED_LOOKUP;
}
@@ -2059,12 +2156,12 @@ void ASTStmtWriter::VisitSubstNonTypeTemplateParmExpr(
SubstNonTypeTemplateParmExpr *E) {
VisitExpr(E);
Record.AddDeclRef(E->getAssociatedDecl());
- Record.push_back(E->isReferenceParameter());
- Record.push_back(E->getIndex());
+ CurrentPackingBits.addBit(E->isReferenceParameter());
+ CurrentPackingBits.addBits(E->getIndex(), /*Width=*/12);
+ CurrentPackingBits.addBit((bool)E->getPackIndex());
if (auto PackIndex = E->getPackIndex())
Record.push_back(*PackIndex + 1);
- else
- Record.push_back(0);
+
Record.AddSourceLocation(E->getNameLoc());
Record.AddStmt(E->getReplacement());
Code = serialization::EXPR_SUBST_NON_TYPE_TEMPLATE_PARM;
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp
index ce1265412655..c990ad138f89 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp
@@ -25,7 +25,7 @@ using namespace ento;
namespace {
class ArrayBoundChecker :
public Checker<check::Location> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Out-of-bound array access"};
public:
void checkLocation(SVal l, bool isLoad, const Stmt* S,
@@ -65,16 +65,13 @@ void ArrayBoundChecker::checkLocation(SVal l, bool isLoad, const Stmt* LoadS,
if (!N)
return;
- if (!BT)
- BT.reset(new BugType(this, "Out-of-bound array access"));
-
// FIXME: It would be nice to eventually make this diagnostic more clear,
// e.g., by referencing the original declaration or by saying *why* this
// reference is outside the range.
// Generate a report for this bug.
auto report = std::make_unique<PathSensitiveBugReport>(
- *BT, "Access out-of-bound array element (buffer overflow)", N);
+ BT, "Access out-of-bound array element (buffer overflow)", N);
report->addRange(LoadS->getSourceRange());
C.emitReport(std::move(report));
diff --git a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
index 5e25153a148f..c72a97cc01e9 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BasicObjCFoundationChecks.cpp
@@ -44,7 +44,7 @@ namespace {
class APIMisuse : public BugType {
public:
APIMisuse(const CheckerBase *checker, const char *name)
- : BugType(checker, name, "API Misuse (Apple)") {}
+ : BugType(checker, name, categories::AppleAPIMisuse) {}
};
} // end anonymous namespace
diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
index 76f091562cd5..66e080adb138 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
@@ -25,20 +25,31 @@ using namespace clang;
using namespace ento;
namespace {
-
class BlockInCriticalSectionChecker : public Checker<check::PostCall> {
-
- mutable IdentifierInfo *IILockGuard, *IIUniqueLock;
-
- CallDescription LockFn, UnlockFn, SleepFn, GetcFn, FgetsFn, ReadFn, RecvFn,
- PthreadLockFn, PthreadTryLockFn, PthreadUnlockFn,
- MtxLock, MtxTimedLock, MtxTryLock, MtxUnlock;
-
- StringRef ClassLockGuard, ClassUniqueLock;
-
- mutable bool IdentifierInfoInitialized;
-
- std::unique_ptr<BugType> BlockInCritSectionBugType;
+ mutable IdentifierInfo *IILockGuard = nullptr;
+ mutable IdentifierInfo *IIUniqueLock = nullptr;
+ mutable bool IdentifierInfoInitialized = false;
+
+ const CallDescription LockFn{{"lock"}};
+ const CallDescription UnlockFn{{"unlock"}};
+ const CallDescription SleepFn{{"sleep"}};
+ const CallDescription GetcFn{{"getc"}};
+ const CallDescription FgetsFn{{"fgets"}};
+ const CallDescription ReadFn{{"read"}};
+ const CallDescription RecvFn{{"recv"}};
+ const CallDescription PthreadLockFn{{"pthread_mutex_lock"}};
+ const CallDescription PthreadTryLockFn{{"pthread_mutex_trylock"}};
+ const CallDescription PthreadUnlockFn{{"pthread_mutex_unlock"}};
+ const CallDescription MtxLock{{"mtx_lock"}};
+ const CallDescription MtxTimedLock{{"mtx_timedlock"}};
+ const CallDescription MtxTryLock{{"mtx_trylock"}};
+ const CallDescription MtxUnlock{{"mtx_unlock"}};
+
+ const llvm::StringLiteral ClassLockGuard{"lock_guard"};
+ const llvm::StringLiteral ClassUniqueLock{"unique_lock"};
+
+ const BugType BlockInCritSectionBugType{
+ this, "Call to blocking function in critical section", "Blocking Error"};
void initIdentifierInfo(ASTContext &Ctx) const;
@@ -47,8 +58,6 @@ class BlockInCriticalSectionChecker : public Checker<check::PostCall> {
CheckerContext &C) const;
public:
- BlockInCriticalSectionChecker();
-
bool isBlockingFunction(const CallEvent &Call) const;
bool isLockFunction(const CallEvent &Call) const;
bool isUnlockFunction(const CallEvent &Call) const;
@@ -63,22 +72,6 @@ public:
REGISTER_TRAIT_WITH_PROGRAMSTATE(MutexCounter, unsigned)
-BlockInCriticalSectionChecker::BlockInCriticalSectionChecker()
- : IILockGuard(nullptr), IIUniqueLock(nullptr), LockFn({"lock"}),
- UnlockFn({"unlock"}), SleepFn({"sleep"}), GetcFn({"getc"}),
- FgetsFn({"fgets"}), ReadFn({"read"}), RecvFn({"recv"}),
- PthreadLockFn({"pthread_mutex_lock"}),
- PthreadTryLockFn({"pthread_mutex_trylock"}),
- PthreadUnlockFn({"pthread_mutex_unlock"}), MtxLock({"mtx_lock"}),
- MtxTimedLock({"mtx_timedlock"}), MtxTryLock({"mtx_trylock"}),
- MtxUnlock({"mtx_unlock"}), ClassLockGuard("lock_guard"),
- ClassUniqueLock("unique_lock"), IdentifierInfoInitialized(false) {
- // Initialize the bug type.
- BlockInCritSectionBugType.reset(
- new BugType(this, "Call to blocking function in critical section",
- "Blocking Error"));
-}
-
void BlockInCriticalSectionChecker::initIdentifierInfo(ASTContext &Ctx) const {
if (!IdentifierInfoInitialized) {
/* In case of checking C code, or when the corresponding headers are not
@@ -151,7 +144,7 @@ void BlockInCriticalSectionChecker::reportBlockInCritSection(
llvm::raw_string_ostream os(msg);
os << "Call to blocking function '" << Call.getCalleeIdentifier()->getName()
<< "' inside of critical section";
- auto R = std::make_unique<PathSensitiveBugReport>(*BlockInCritSectionBugType,
+ auto R = std::make_unique<PathSensitiveBugReport>(BlockInCritSectionBugType,
os.str(), ErrNode);
R->addRange(Call.getSourceRange());
R->markInteresting(BlockDescSym);
diff --git a/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp
index 361a4eed9221..a09db6d2d0ec 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BoolAssignmentChecker.cpp
@@ -24,7 +24,7 @@ using namespace ento;
namespace {
class BoolAssignmentChecker : public Checker< check::Bind > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Assignment of a non-Boolean value"};
void emitReport(ProgramStateRef state, CheckerContext &C,
bool IsTainted = false) const;
@@ -36,12 +36,9 @@ namespace {
void BoolAssignmentChecker::emitReport(ProgramStateRef state, CheckerContext &C,
bool IsTainted) const {
if (ExplodedNode *N = C.generateNonFatalErrorNode(state)) {
- if (!BT)
- BT.reset(new BugType(this, "Assignment of a non-Boolean value"));
-
StringRef Msg = IsTainted ? "Might assign a tainted non-Boolean value"
: "Assignment of a non-Boolean value";
- C.emitReport(std::make_unique<PathSensitiveBugReport>(*BT, Msg, N));
+ C.emitReport(std::make_unique<PathSensitiveBugReport>(BT, Msg, N));
}
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 31f5b03dcdeb..b7b64c3da4f6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -121,7 +121,7 @@ public:
const CallEvent *Call) const;
using FnCheck = std::function<void(const CStringChecker *, CheckerContext &,
- const CallExpr *)>;
+ const CallEvent &)>;
CallDescriptionMap<FnCheck> Callbacks = {
{{CDF_MaybeBuiltin, {"memcpy"}, 3},
@@ -173,56 +173,53 @@ public:
StdCopyBackward{{"std", "copy_backward"}, 3};
FnCheck identifyCall(const CallEvent &Call, CheckerContext &C) const;
- void evalMemcpy(CheckerContext &C, const CallExpr *CE, CharKind CK) const;
- void evalMempcpy(CheckerContext &C, const CallExpr *CE, CharKind CK) const;
- void evalMemmove(CheckerContext &C, const CallExpr *CE, CharKind CK) const;
- void evalBcopy(CheckerContext &C, const CallExpr *CE) const;
- void evalCopyCommon(CheckerContext &C, const CallExpr *CE,
+ void evalMemcpy(CheckerContext &C, const CallEvent &Call, CharKind CK) const;
+ void evalMempcpy(CheckerContext &C, const CallEvent &Call, CharKind CK) const;
+ void evalMemmove(CheckerContext &C, const CallEvent &Call, CharKind CK) const;
+ void evalBcopy(CheckerContext &C, const CallEvent &Call) const;
+ void evalCopyCommon(CheckerContext &C, const CallEvent &Call,
ProgramStateRef state, SizeArgExpr Size,
DestinationArgExpr Dest, SourceArgExpr Source,
bool Restricted, bool IsMempcpy, CharKind CK) const;
- void evalMemcmp(CheckerContext &C, const CallExpr *CE, CharKind CK) const;
+ void evalMemcmp(CheckerContext &C, const CallEvent &Call, CharKind CK) const;
- void evalstrLength(CheckerContext &C, const CallExpr *CE) const;
- void evalstrnLength(CheckerContext &C, const CallExpr *CE) const;
- void evalstrLengthCommon(CheckerContext &C,
- const CallExpr *CE,
+ void evalstrLength(CheckerContext &C, const CallEvent &Call) const;
+ void evalstrnLength(CheckerContext &C, const CallEvent &Call) const;
+ void evalstrLengthCommon(CheckerContext &C, const CallEvent &Call,
bool IsStrnlen = false) const;
- void evalStrcpy(CheckerContext &C, const CallExpr *CE) const;
- void evalStrncpy(CheckerContext &C, const CallExpr *CE) const;
- void evalStpcpy(CheckerContext &C, const CallExpr *CE) const;
- void evalStrlcpy(CheckerContext &C, const CallExpr *CE) const;
- void evalStrcpyCommon(CheckerContext &C, const CallExpr *CE, bool ReturnEnd,
- bool IsBounded, ConcatFnKind appendK,
+ void evalStrcpy(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrncpy(CheckerContext &C, const CallEvent &Call) const;
+ void evalStpcpy(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrlcpy(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrcpyCommon(CheckerContext &C, const CallEvent &Call,
+ bool ReturnEnd, bool IsBounded, ConcatFnKind appendK,
bool returnPtr = true) const;
- void evalStrcat(CheckerContext &C, const CallExpr *CE) const;
- void evalStrncat(CheckerContext &C, const CallExpr *CE) const;
- void evalStrlcat(CheckerContext &C, const CallExpr *CE) const;
+ void evalStrcat(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrncat(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrlcat(CheckerContext &C, const CallEvent &Call) const;
- void evalStrcmp(CheckerContext &C, const CallExpr *CE) const;
- void evalStrncmp(CheckerContext &C, const CallExpr *CE) const;
- void evalStrcasecmp(CheckerContext &C, const CallExpr *CE) const;
- void evalStrncasecmp(CheckerContext &C, const CallExpr *CE) const;
- void evalStrcmpCommon(CheckerContext &C,
- const CallExpr *CE,
- bool IsBounded = false,
- bool IgnoreCase = false) const;
+ void evalStrcmp(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrncmp(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrcasecmp(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrncasecmp(CheckerContext &C, const CallEvent &Call) const;
+ void evalStrcmpCommon(CheckerContext &C, const CallEvent &Call,
+ bool IsBounded = false, bool IgnoreCase = false) const;
- void evalStrsep(CheckerContext &C, const CallExpr *CE) const;
+ void evalStrsep(CheckerContext &C, const CallEvent &Call) const;
- void evalStdCopy(CheckerContext &C, const CallExpr *CE) const;
- void evalStdCopyBackward(CheckerContext &C, const CallExpr *CE) const;
- void evalStdCopyCommon(CheckerContext &C, const CallExpr *CE) const;
- void evalMemset(CheckerContext &C, const CallExpr *CE) const;
- void evalBzero(CheckerContext &C, const CallExpr *CE) const;
+ void evalStdCopy(CheckerContext &C, const CallEvent &Call) const;
+ void evalStdCopyBackward(CheckerContext &C, const CallEvent &Call) const;
+ void evalStdCopyCommon(CheckerContext &C, const CallEvent &Call) const;
+ void evalMemset(CheckerContext &C, const CallEvent &Call) const;
+ void evalBzero(CheckerContext &C, const CallEvent &Call) const;
- void evalSprintf(CheckerContext &C, const CallExpr *CE) const;
- void evalSnprintf(CheckerContext &C, const CallExpr *CE) const;
- void evalSprintfCommon(CheckerContext &C, const CallExpr *CE, bool IsBounded,
- bool IsBuiltin) const;
+ void evalSprintf(CheckerContext &C, const CallEvent &Call) const;
+ void evalSnprintf(CheckerContext &C, const CallEvent &Call) const;
+ void evalSprintfCommon(CheckerContext &C, const CallEvent &Call,
+ bool IsBounded, bool IsBuiltin) const;
// Utility methods
std::pair<ProgramStateRef , ProgramStateRef >
@@ -1291,7 +1288,7 @@ bool CStringChecker::memsetAux(const Expr *DstBuffer, SVal CharVal,
// evaluation of individual function calls.
//===----------------------------------------------------------------------===//
-void CStringChecker::evalCopyCommon(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalCopyCommon(CheckerContext &C, const CallEvent &Call,
ProgramStateRef state, SizeArgExpr Size,
DestinationArgExpr Dest,
SourceArgExpr Source, bool Restricted,
@@ -1313,7 +1310,8 @@ void CStringChecker::evalCopyCommon(CheckerContext &C, const CallExpr *CE,
// If the size is zero, there won't be any actual memory access, so
// just bind the return value to the destination buffer and return.
if (stateZeroSize && !stateNonZeroSize) {
- stateZeroSize = stateZeroSize->BindExpr(CE, LCtx, destVal);
+ stateZeroSize =
+ stateZeroSize->BindExpr(Call.getOriginExpr(), LCtx, destVal);
C.addTransition(stateZeroSize);
return;
}
@@ -1361,15 +1359,15 @@ void CStringChecker::evalCopyCommon(CheckerContext &C, const CallExpr *CE,
// If we don't know how much we copied, we can at least
// conjure a return value for later.
if (lastElement.isUnknown())
- lastElement = C.getSValBuilder().conjureSymbolVal(nullptr, CE, LCtx,
- C.blockCount());
+ lastElement = C.getSValBuilder().conjureSymbolVal(
+ nullptr, Call.getOriginExpr(), LCtx, C.blockCount());
// The byte after the last byte copied is the return value.
- state = state->BindExpr(CE, LCtx, lastElement);
+ state = state->BindExpr(Call.getOriginExpr(), LCtx, lastElement);
} else {
// All other copies return the destination buffer.
// (Well, bcopy() has a void return type, but this won't hurt.)
- state = state->BindExpr(CE, LCtx, destVal);
+ state = state->BindExpr(Call.getOriginExpr(), LCtx, destVal);
}
// Invalidate the destination (regular invalidation without pointer-escaping
@@ -1391,69 +1389,69 @@ void CStringChecker::evalCopyCommon(CheckerContext &C, const CallExpr *CE,
}
}
-void CStringChecker::evalMemcpy(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalMemcpy(CheckerContext &C, const CallEvent &Call,
CharKind CK) const {
// void *memcpy(void *restrict dst, const void *restrict src, size_t n);
// The return value is the address of the destination buffer.
- DestinationArgExpr Dest = {{CE->getArg(0), 0}};
- SourceArgExpr Src = {{CE->getArg(1), 1}};
- SizeArgExpr Size = {{CE->getArg(2), 2}};
+ DestinationArgExpr Dest = {{Call.getArgExpr(0), 0}};
+ SourceArgExpr Src = {{Call.getArgExpr(1), 1}};
+ SizeArgExpr Size = {{Call.getArgExpr(2), 2}};
ProgramStateRef State = C.getState();
constexpr bool IsRestricted = true;
constexpr bool IsMempcpy = false;
- evalCopyCommon(C, CE, State, Size, Dest, Src, IsRestricted, IsMempcpy, CK);
+ evalCopyCommon(C, Call, State, Size, Dest, Src, IsRestricted, IsMempcpy, CK);
}
-void CStringChecker::evalMempcpy(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalMempcpy(CheckerContext &C, const CallEvent &Call,
CharKind CK) const {
// void *mempcpy(void *restrict dst, const void *restrict src, size_t n);
// The return value is a pointer to the byte following the last written byte.
- DestinationArgExpr Dest = {{CE->getArg(0), 0}};
- SourceArgExpr Src = {{CE->getArg(1), 1}};
- SizeArgExpr Size = {{CE->getArg(2), 2}};
+ DestinationArgExpr Dest = {{Call.getArgExpr(0), 0}};
+ SourceArgExpr Src = {{Call.getArgExpr(1), 1}};
+ SizeArgExpr Size = {{Call.getArgExpr(2), 2}};
constexpr bool IsRestricted = true;
constexpr bool IsMempcpy = true;
- evalCopyCommon(C, CE, C.getState(), Size, Dest, Src, IsRestricted, IsMempcpy,
- CK);
+ evalCopyCommon(C, Call, C.getState(), Size, Dest, Src, IsRestricted,
+ IsMempcpy, CK);
}
-void CStringChecker::evalMemmove(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalMemmove(CheckerContext &C, const CallEvent &Call,
CharKind CK) const {
// void *memmove(void *dst, const void *src, size_t n);
// The return value is the address of the destination buffer.
- DestinationArgExpr Dest = {{CE->getArg(0), 0}};
- SourceArgExpr Src = {{CE->getArg(1), 1}};
- SizeArgExpr Size = {{CE->getArg(2), 2}};
+ DestinationArgExpr Dest = {{Call.getArgExpr(0), 0}};
+ SourceArgExpr Src = {{Call.getArgExpr(1), 1}};
+ SizeArgExpr Size = {{Call.getArgExpr(2), 2}};
constexpr bool IsRestricted = false;
constexpr bool IsMempcpy = false;
- evalCopyCommon(C, CE, C.getState(), Size, Dest, Src, IsRestricted, IsMempcpy,
- CK);
+ evalCopyCommon(C, Call, C.getState(), Size, Dest, Src, IsRestricted,
+ IsMempcpy, CK);
}
-void CStringChecker::evalBcopy(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalBcopy(CheckerContext &C, const CallEvent &Call) const {
// void bcopy(const void *src, void *dst, size_t n);
- SourceArgExpr Src{{CE->getArg(0), 0}};
- DestinationArgExpr Dest = {{CE->getArg(1), 1}};
- SizeArgExpr Size = {{CE->getArg(2), 2}};
+ SourceArgExpr Src{{Call.getArgExpr(0), 0}};
+ DestinationArgExpr Dest = {{Call.getArgExpr(1), 1}};
+ SizeArgExpr Size = {{Call.getArgExpr(2), 2}};
constexpr bool IsRestricted = false;
constexpr bool IsMempcpy = false;
- evalCopyCommon(C, CE, C.getState(), Size, Dest, Src, IsRestricted, IsMempcpy,
- CharKind::Regular);
+ evalCopyCommon(C, Call, C.getState(), Size, Dest, Src, IsRestricted,
+ IsMempcpy, CharKind::Regular);
}
-void CStringChecker::evalMemcmp(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalMemcmp(CheckerContext &C, const CallEvent &Call,
CharKind CK) const {
// int memcmp(const void *s1, const void *s2, size_t n);
CurrentFunctionDescription = "memory comparison function";
- AnyArgExpr Left = {CE->getArg(0), 0};
- AnyArgExpr Right = {CE->getArg(1), 1};
- SizeArgExpr Size = {{CE->getArg(2), 2}};
+ AnyArgExpr Left = {Call.getArgExpr(0), 0};
+ AnyArgExpr Right = {Call.getArgExpr(1), 1};
+ SizeArgExpr Size = {{Call.getArgExpr(2), 2}};
ProgramStateRef State = C.getState();
SValBuilder &Builder = C.getSValBuilder();
@@ -1471,7 +1469,8 @@ void CStringChecker::evalMemcmp(CheckerContext &C, const CallExpr *CE,
// have to check either of the buffers.
if (stateZeroSize) {
State = stateZeroSize;
- State = State->BindExpr(CE, LCtx, Builder.makeZeroVal(CE->getType()));
+ State = State->BindExpr(Call.getOriginExpr(), LCtx,
+ Builder.makeZeroVal(Call.getResultType()));
C.addTransition(State);
}
@@ -1497,8 +1496,8 @@ void CStringChecker::evalMemcmp(CheckerContext &C, const CallExpr *CE,
State = SameBuffer;
State = CheckBufferAccess(C, State, Left, Size, AccessKind::read);
if (State) {
- State =
- SameBuffer->BindExpr(CE, LCtx, Builder.makeZeroVal(CE->getType()));
+ State = SameBuffer->BindExpr(Call.getOriginExpr(), LCtx,
+ Builder.makeZeroVal(Call.getResultType()));
C.addTransition(State);
}
return;
@@ -1511,33 +1510,35 @@ void CStringChecker::evalMemcmp(CheckerContext &C, const CallExpr *CE,
State = CheckBufferAccess(C, State, Left, Size, AccessKind::read, CK);
if (State) {
// The return value is the comparison result, which we don't know.
- SVal CmpV = Builder.conjureSymbolVal(nullptr, CE, LCtx, C.blockCount());
- State = State->BindExpr(CE, LCtx, CmpV);
+ SVal CmpV = Builder.conjureSymbolVal(nullptr, Call.getOriginExpr(), LCtx,
+ C.blockCount());
+ State = State->BindExpr(Call.getOriginExpr(), LCtx, CmpV);
C.addTransition(State);
}
}
}
void CStringChecker::evalstrLength(CheckerContext &C,
- const CallExpr *CE) const {
+ const CallEvent &Call) const {
// size_t strlen(const char *s);
- evalstrLengthCommon(C, CE, /* IsStrnlen = */ false);
+ evalstrLengthCommon(C, Call, /* IsStrnlen = */ false);
}
void CStringChecker::evalstrnLength(CheckerContext &C,
- const CallExpr *CE) const {
+ const CallEvent &Call) const {
// size_t strnlen(const char *s, size_t maxlen);
- evalstrLengthCommon(C, CE, /* IsStrnlen = */ true);
+ evalstrLengthCommon(C, Call, /* IsStrnlen = */ true);
}
-void CStringChecker::evalstrLengthCommon(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalstrLengthCommon(CheckerContext &C,
+ const CallEvent &Call,
bool IsStrnlen) const {
CurrentFunctionDescription = "string length function";
ProgramStateRef state = C.getState();
const LocationContext *LCtx = C.getLocationContext();
if (IsStrnlen) {
- const Expr *maxlenExpr = CE->getArg(1);
+ const Expr *maxlenExpr = Call.getArgExpr(1);
SVal maxlenVal = state->getSVal(maxlenExpr, LCtx);
ProgramStateRef stateZeroSize, stateNonZeroSize;
@@ -1547,8 +1548,8 @@ void CStringChecker::evalstrLengthCommon(CheckerContext &C, const CallExpr *CE,
// If the size can be zero, the result will be 0 in that case, and we don't
// have to check the string itself.
if (stateZeroSize) {
- SVal zero = C.getSValBuilder().makeZeroVal(CE->getType());
- stateZeroSize = stateZeroSize->BindExpr(CE, LCtx, zero);
+ SVal zero = C.getSValBuilder().makeZeroVal(Call.getResultType());
+ stateZeroSize = stateZeroSize->BindExpr(Call.getOriginExpr(), LCtx, zero);
C.addTransition(stateZeroSize);
}
@@ -1561,7 +1562,7 @@ void CStringChecker::evalstrLengthCommon(CheckerContext &C, const CallExpr *CE,
}
// Check that the string argument is non-null.
- AnyArgExpr Arg = {CE->getArg(0), 0};
+ AnyArgExpr Arg = {Call.getArgExpr(0), 0};
SVal ArgVal = state->getSVal(Arg.Expression, LCtx);
state = checkNonNull(C, state, Arg, ArgVal);
@@ -1584,7 +1585,7 @@ void CStringChecker::evalstrLengthCommon(CheckerContext &C, const CallExpr *CE,
// It's a little unfortunate to be getting this again,
// but it's not that expensive...
- const Expr *maxlenExpr = CE->getArg(1);
+ const Expr *maxlenExpr = Call.getArgExpr(1);
SVal maxlenVal = state->getSVal(maxlenExpr, LCtx);
std::optional<NonLoc> strLengthNL = strLength.getAs<NonLoc>();
@@ -1613,8 +1614,8 @@ void CStringChecker::evalstrLengthCommon(CheckerContext &C, const CallExpr *CE,
// no guarantee the full string length will actually be returned.
// All we know is the return value is the min of the string length
// and the limit. This is better than nothing.
- result = C.getSValBuilder().conjureSymbolVal(nullptr, CE, LCtx,
- C.blockCount());
+ result = C.getSValBuilder().conjureSymbolVal(
+ nullptr, Call.getOriginExpr(), LCtx, C.blockCount());
NonLoc resultNL = result.castAs<NonLoc>();
if (strLengthNL) {
@@ -1637,78 +1638,85 @@ void CStringChecker::evalstrLengthCommon(CheckerContext &C, const CallExpr *CE,
// If we don't know the length of the string, conjure a return
// value, so it can be used in constraints, at least.
if (result.isUnknown()) {
- result = C.getSValBuilder().conjureSymbolVal(nullptr, CE, LCtx,
- C.blockCount());
+ result = C.getSValBuilder().conjureSymbolVal(
+ nullptr, Call.getOriginExpr(), LCtx, C.blockCount());
}
}
// Bind the return value.
assert(!result.isUnknown() && "Should have conjured a value by now");
- state = state->BindExpr(CE, LCtx, result);
+ state = state->BindExpr(Call.getOriginExpr(), LCtx, result);
C.addTransition(state);
}
-void CStringChecker::evalStrcpy(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrcpy(CheckerContext &C,
+ const CallEvent &Call) const {
// char *strcpy(char *restrict dst, const char *restrict src);
- evalStrcpyCommon(C, CE,
+ evalStrcpyCommon(C, Call,
/* ReturnEnd = */ false,
/* IsBounded = */ false,
/* appendK = */ ConcatFnKind::none);
}
-void CStringChecker::evalStrncpy(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrncpy(CheckerContext &C,
+ const CallEvent &Call) const {
// char *strncpy(char *restrict dst, const char *restrict src, size_t n);
- evalStrcpyCommon(C, CE,
+ evalStrcpyCommon(C, Call,
/* ReturnEnd = */ false,
/* IsBounded = */ true,
/* appendK = */ ConcatFnKind::none);
}
-void CStringChecker::evalStpcpy(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStpcpy(CheckerContext &C,
+ const CallEvent &Call) const {
// char *stpcpy(char *restrict dst, const char *restrict src);
- evalStrcpyCommon(C, CE,
+ evalStrcpyCommon(C, Call,
/* ReturnEnd = */ true,
/* IsBounded = */ false,
/* appendK = */ ConcatFnKind::none);
}
-void CStringChecker::evalStrlcpy(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrlcpy(CheckerContext &C,
+ const CallEvent &Call) const {
// size_t strlcpy(char *dest, const char *src, size_t size);
- evalStrcpyCommon(C, CE,
+ evalStrcpyCommon(C, Call,
/* ReturnEnd = */ true,
/* IsBounded = */ true,
/* appendK = */ ConcatFnKind::none,
/* returnPtr = */ false);
}
-void CStringChecker::evalStrcat(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrcat(CheckerContext &C,
+ const CallEvent &Call) const {
// char *strcat(char *restrict s1, const char *restrict s2);
- evalStrcpyCommon(C, CE,
+ evalStrcpyCommon(C, Call,
/* ReturnEnd = */ false,
/* IsBounded = */ false,
/* appendK = */ ConcatFnKind::strcat);
}
-void CStringChecker::evalStrncat(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrncat(CheckerContext &C,
+ const CallEvent &Call) const {
// char *strncat(char *restrict s1, const char *restrict s2, size_t n);
- evalStrcpyCommon(C, CE,
+ evalStrcpyCommon(C, Call,
/* ReturnEnd = */ false,
/* IsBounded = */ true,
/* appendK = */ ConcatFnKind::strcat);
}
-void CStringChecker::evalStrlcat(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrlcat(CheckerContext &C,
+ const CallEvent &Call) const {
// size_t strlcat(char *dst, const char *src, size_t size);
// It will append at most size - strlen(dst) - 1 bytes,
// NULL-terminating the result.
- evalStrcpyCommon(C, CE,
+ evalStrcpyCommon(C, Call,
/* ReturnEnd = */ false,
/* IsBounded = */ true,
/* appendK = */ ConcatFnKind::strlcat,
/* returnPtr = */ false);
}
-void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallEvent &Call,
bool ReturnEnd, bool IsBounded,
ConcatFnKind appendK,
bool returnPtr) const {
@@ -1721,14 +1729,14 @@ void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallExpr *CE,
const LocationContext *LCtx = C.getLocationContext();
// Check that the destination is non-null.
- DestinationArgExpr Dst = {{CE->getArg(0), 0}};
+ DestinationArgExpr Dst = {{Call.getArgExpr(0), 0}};
SVal DstVal = state->getSVal(Dst.Expression, LCtx);
state = checkNonNull(C, state, Dst, DstVal);
if (!state)
return;
// Check that the source is non-null.
- SourceArgExpr srcExpr = {{CE->getArg(1), 1}};
+ SourceArgExpr srcExpr = {{Call.getArgExpr(1), 1}};
SVal srcVal = state->getSVal(srcExpr.Expression, LCtx);
state = checkNonNull(C, state, srcExpr, srcVal);
if (!state)
@@ -1763,8 +1771,8 @@ void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallExpr *CE,
{srcExpr.Expression, srcExpr.ArgumentIndex}};
state = CheckOverlap(
C, state,
- (IsBounded ? SizeArgExpr{{CE->getArg(2), 2}} : SrcExprAsSizeDummy), Dst,
- srcExpr);
+ (IsBounded ? SizeArgExpr{{Call.getArgExpr(2), 2}} : SrcExprAsSizeDummy),
+ Dst, srcExpr);
if (!state)
return;
@@ -1772,7 +1780,7 @@ void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallExpr *CE,
// If the function is strncpy, strncat, etc... it is bounded.
if (IsBounded) {
// Get the max number of characters to copy.
- SizeArgExpr lenExpr = {{CE->getArg(2), 2}};
+ SizeArgExpr lenExpr = {{Call.getArgExpr(2), 2}};
SVal lenVal = state->getSVal(lenExpr.Expression, LCtx);
// Protect against misdeclared strncpy().
@@ -1886,16 +1894,19 @@ void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallExpr *CE,
// If the size is known to be zero, we're done.
if (StateZeroSize && !StateNonZeroSize) {
if (returnPtr) {
- StateZeroSize = StateZeroSize->BindExpr(CE, LCtx, DstVal);
+ StateZeroSize =
+ StateZeroSize->BindExpr(Call.getOriginExpr(), LCtx, DstVal);
} else {
if (appendK == ConcatFnKind::none) {
// strlcpy returns strlen(src)
- StateZeroSize = StateZeroSize->BindExpr(CE, LCtx, strLength);
+ StateZeroSize = StateZeroSize->BindExpr(Call.getOriginExpr(),
+ LCtx, strLength);
} else {
// strlcat returns strlen(src) + strlen(dst)
SVal retSize = svalBuilder.evalBinOp(
state, BO_Add, strLength, dstStrLength, sizeTy);
- StateZeroSize = StateZeroSize->BindExpr(CE, LCtx, retSize);
+ StateZeroSize =
+ StateZeroSize->BindExpr(Call.getOriginExpr(), LCtx, retSize);
}
}
C.addTransition(StateZeroSize);
@@ -1964,7 +1975,8 @@ void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallExpr *CE,
if (finalStrLength.isUnknown()) {
// Try to get a "hypothetical" string length symbol, which we can later
// set as a real value if that turns out to be the case.
- finalStrLength = getCStringLength(C, state, CE, DstVal, true);
+ finalStrLength =
+ getCStringLength(C, state, Call.getOriginExpr(), DstVal, true);
assert(!finalStrLength.isUndef());
if (std::optional<NonLoc> finalStrLengthNL =
@@ -2094,51 +2106,54 @@ void CStringChecker::evalStrcpyCommon(CheckerContext &C, const CallExpr *CE,
// If this is a stpcpy-style copy, but we were unable to check for a buffer
// overflow, we still need a result. Conjure a return value.
if (ReturnEnd && Result.isUnknown()) {
- Result = svalBuilder.conjureSymbolVal(nullptr, CE, LCtx, C.blockCount());
+ Result = svalBuilder.conjureSymbolVal(nullptr, Call.getOriginExpr(), LCtx,
+ C.blockCount());
}
}
// Set the return value.
- state = state->BindExpr(CE, LCtx, Result);
+ state = state->BindExpr(Call.getOriginExpr(), LCtx, Result);
C.addTransition(state);
}
-void CStringChecker::evalStrcmp(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrcmp(CheckerContext &C,
+ const CallEvent &Call) const {
//int strcmp(const char *s1, const char *s2);
- evalStrcmpCommon(C, CE, /* IsBounded = */ false, /* IgnoreCase = */ false);
+ evalStrcmpCommon(C, Call, /* IsBounded = */ false, /* IgnoreCase = */ false);
}
-void CStringChecker::evalStrncmp(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrncmp(CheckerContext &C,
+ const CallEvent &Call) const {
//int strncmp(const char *s1, const char *s2, size_t n);
- evalStrcmpCommon(C, CE, /* IsBounded = */ true, /* IgnoreCase = */ false);
+ evalStrcmpCommon(C, Call, /* IsBounded = */ true, /* IgnoreCase = */ false);
}
void CStringChecker::evalStrcasecmp(CheckerContext &C,
- const CallExpr *CE) const {
+ const CallEvent &Call) const {
//int strcasecmp(const char *s1, const char *s2);
- evalStrcmpCommon(C, CE, /* IsBounded = */ false, /* IgnoreCase = */ true);
+ evalStrcmpCommon(C, Call, /* IsBounded = */ false, /* IgnoreCase = */ true);
}
void CStringChecker::evalStrncasecmp(CheckerContext &C,
- const CallExpr *CE) const {
+ const CallEvent &Call) const {
//int strncasecmp(const char *s1, const char *s2, size_t n);
- evalStrcmpCommon(C, CE, /* IsBounded = */ true, /* IgnoreCase = */ true);
+ evalStrcmpCommon(C, Call, /* IsBounded = */ true, /* IgnoreCase = */ true);
}
-void CStringChecker::evalStrcmpCommon(CheckerContext &C, const CallExpr *CE,
- bool IsBounded, bool IgnoreCase) const {
+void CStringChecker::evalStrcmpCommon(CheckerContext &C, const CallEvent &Call,
+ bool IsBounded, bool IgnoreCase) const {
CurrentFunctionDescription = "string comparison function";
ProgramStateRef state = C.getState();
const LocationContext *LCtx = C.getLocationContext();
// Check that the first string is non-null
- AnyArgExpr Left = {CE->getArg(0), 0};
+ AnyArgExpr Left = {Call.getArgExpr(0), 0};
SVal LeftVal = state->getSVal(Left.Expression, LCtx);
state = checkNonNull(C, state, Left, LeftVal);
if (!state)
return;
// Check that the second string is non-null.
- AnyArgExpr Right = {CE->getArg(1), 1};
+ AnyArgExpr Right = {Call.getArgExpr(1), 1};
SVal RightVal = state->getSVal(Right.Expression, LCtx);
state = checkNonNull(C, state, Right, RightVal);
if (!state)
@@ -2169,8 +2184,9 @@ void CStringChecker::evalStrcmpCommon(CheckerContext &C, const CallExpr *CE,
// If the two arguments might be the same buffer, we know the result is 0,
// and we only need to check one size.
if (StSameBuf) {
- StSameBuf = StSameBuf->BindExpr(CE, LCtx,
- svalBuilder.makeZeroVal(CE->getType()));
+ StSameBuf =
+ StSameBuf->BindExpr(Call.getOriginExpr(), LCtx,
+ svalBuilder.makeZeroVal(Call.getResultType()));
C.addTransition(StSameBuf);
// If the two arguments are GUARANTEED to be the same, we're done!
@@ -2190,8 +2206,8 @@ void CStringChecker::evalStrcmpCommon(CheckerContext &C, const CallExpr *CE,
const StringLiteral *RightStrLiteral =
getCStringLiteral(C, state, Right.Expression, RightVal);
bool canComputeResult = false;
- SVal resultVal = svalBuilder.conjureSymbolVal(nullptr, CE, LCtx,
- C.blockCount());
+ SVal resultVal = svalBuilder.conjureSymbolVal(nullptr, Call.getOriginExpr(),
+ LCtx, C.blockCount());
if (LeftStrLiteral && RightStrLiteral) {
StringRef LeftStrRef = LeftStrLiteral->getString();
@@ -2199,7 +2215,7 @@ void CStringChecker::evalStrcmpCommon(CheckerContext &C, const CallExpr *CE,
if (IsBounded) {
// Get the max number of characters to compare.
- const Expr *lenExpr = CE->getArg(2);
+ const Expr *lenExpr = Call.getArgExpr(2);
SVal lenVal = state->getSVal(lenExpr, LCtx);
// If the length is known, we can get the right substrings.
@@ -2231,10 +2247,10 @@ void CStringChecker::evalStrcmpCommon(CheckerContext &C, const CallExpr *CE,
// The strcmp function returns an integer greater than, equal to, or less
// than zero, [c11, p7.24.4.2].
if (compareRes == 0) {
- resultVal = svalBuilder.makeIntVal(compareRes, CE->getType());
+ resultVal = svalBuilder.makeIntVal(compareRes, Call.getResultType());
}
else {
- DefinedSVal zeroVal = svalBuilder.makeIntVal(0, CE->getType());
+ DefinedSVal zeroVal = svalBuilder.makeIntVal(0, Call.getResultType());
// Constrain strcmp's result range based on the result of StringRef's
// comparison methods.
BinaryOperatorKind op = (compareRes > 0) ? BO_GT : BO_LT;
@@ -2247,20 +2263,21 @@ void CStringChecker::evalStrcmpCommon(CheckerContext &C, const CallExpr *CE,
}
}
- state = state->BindExpr(CE, LCtx, resultVal);
+ state = state->BindExpr(Call.getOriginExpr(), LCtx, resultVal);
// Record this as a possible path.
C.addTransition(state);
}
-void CStringChecker::evalStrsep(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalStrsep(CheckerContext &C,
+ const CallEvent &Call) const {
// char *strsep(char **stringp, const char *delim);
// Verify whether the search string parameter matches the return type.
- SourceArgExpr SearchStrPtr = {{CE->getArg(0), 0}};
+ SourceArgExpr SearchStrPtr = {{Call.getArgExpr(0), 0}};
QualType CharPtrTy = SearchStrPtr.Expression->getType()->getPointeeType();
- if (CharPtrTy.isNull() ||
- CE->getType().getUnqualifiedType() != CharPtrTy.getUnqualifiedType())
+ if (CharPtrTy.isNull() || Call.getResultType().getUnqualifiedType() !=
+ CharPtrTy.getUnqualifiedType())
return;
CurrentFunctionDescription = "strsep()";
@@ -2275,7 +2292,7 @@ void CStringChecker::evalStrsep(CheckerContext &C, const CallExpr *CE) const {
return;
// Check that the delimiter string is non-null.
- AnyArgExpr DelimStr = {CE->getArg(1), 1};
+ AnyArgExpr DelimStr = {Call.getArgExpr(1), 1};
SVal DelimStrVal = State->getSVal(DelimStr.Expression, LCtx);
State = checkNonNull(C, State, DelimStr, DelimStrVal);
if (!State)
@@ -2295,37 +2312,37 @@ void CStringChecker::evalStrsep(CheckerContext &C, const CallExpr *CE) const {
// Overwrite the search string pointer. The new value is either an address
// further along in the same string, or NULL if there are no more tokens.
- State = State->bindLoc(*SearchStrLoc,
- SVB.conjureSymbolVal(getTag(),
- CE,
- LCtx,
- CharPtrTy,
- C.blockCount()),
- LCtx);
+ State =
+ State->bindLoc(*SearchStrLoc,
+ SVB.conjureSymbolVal(getTag(), Call.getOriginExpr(),
+ LCtx, CharPtrTy, C.blockCount()),
+ LCtx);
} else {
assert(SearchStrVal.isUnknown());
// Conjure a symbolic value. It's the best we can do.
- Result = SVB.conjureSymbolVal(nullptr, CE, LCtx, C.blockCount());
+ Result = SVB.conjureSymbolVal(nullptr, Call.getOriginExpr(), LCtx,
+ C.blockCount());
}
// Set the return value, and finish.
- State = State->BindExpr(CE, LCtx, Result);
+ State = State->BindExpr(Call.getOriginExpr(), LCtx, Result);
C.addTransition(State);
}
// These should probably be moved into a C++ standard library checker.
-void CStringChecker::evalStdCopy(CheckerContext &C, const CallExpr *CE) const {
- evalStdCopyCommon(C, CE);
+void CStringChecker::evalStdCopy(CheckerContext &C,
+ const CallEvent &Call) const {
+ evalStdCopyCommon(C, Call);
}
void CStringChecker::evalStdCopyBackward(CheckerContext &C,
- const CallExpr *CE) const {
- evalStdCopyCommon(C, CE);
+ const CallEvent &Call) const {
+ evalStdCopyCommon(C, Call);
}
void CStringChecker::evalStdCopyCommon(CheckerContext &C,
- const CallExpr *CE) const {
- if (!CE->getArg(2)->getType()->isPointerType())
+ const CallEvent &Call) const {
+ if (!Call.getArgExpr(2)->getType()->isPointerType())
return;
ProgramStateRef State = C.getState();
@@ -2338,7 +2355,7 @@ void CStringChecker::evalStdCopyCommon(CheckerContext &C,
// _OutputIterator __result)
// Invalidate the destination buffer
- const Expr *Dst = CE->getArg(2);
+ const Expr *Dst = Call.getArgExpr(2);
SVal DstVal = State->getSVal(Dst, LCtx);
// FIXME: As we do not know how many items are copied, we also invalidate the
// super region containing the target location.
@@ -2347,19 +2364,21 @@ void CStringChecker::evalStdCopyCommon(CheckerContext &C,
SValBuilder &SVB = C.getSValBuilder();
- SVal ResultVal = SVB.conjureSymbolVal(nullptr, CE, LCtx, C.blockCount());
- State = State->BindExpr(CE, LCtx, ResultVal);
+ SVal ResultVal =
+ SVB.conjureSymbolVal(nullptr, Call.getOriginExpr(), LCtx, C.blockCount());
+ State = State->BindExpr(Call.getOriginExpr(), LCtx, ResultVal);
C.addTransition(State);
}
-void CStringChecker::evalMemset(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalMemset(CheckerContext &C,
+ const CallEvent &Call) const {
// void *memset(void *s, int c, size_t n);
CurrentFunctionDescription = "memory set function";
- DestinationArgExpr Buffer = {{CE->getArg(0), 0}};
- AnyArgExpr CharE = {CE->getArg(1), 1};
- SizeArgExpr Size = {{CE->getArg(2), 2}};
+ DestinationArgExpr Buffer = {{Call.getArgExpr(0), 0}};
+ AnyArgExpr CharE = {Call.getArgExpr(1), 1};
+ SizeArgExpr Size = {{Call.getArgExpr(2), 2}};
ProgramStateRef State = C.getState();
@@ -2377,7 +2396,7 @@ void CStringChecker::evalMemset(CheckerContext &C, const CallExpr *CE) const {
// If the size is zero, there won't be any actual memory access, so
// just bind the return value to the buffer and return.
if (ZeroSize && !NonZeroSize) {
- ZeroSize = ZeroSize->BindExpr(CE, LCtx, BufferPtrVal);
+ ZeroSize = ZeroSize->BindExpr(Call.getOriginExpr(), LCtx, BufferPtrVal);
C.addTransition(ZeroSize);
return;
}
@@ -2399,15 +2418,15 @@ void CStringChecker::evalMemset(CheckerContext &C, const CallExpr *CE) const {
Size.Expression, C, State))
return;
- State = State->BindExpr(CE, LCtx, BufferPtrVal);
+ State = State->BindExpr(Call.getOriginExpr(), LCtx, BufferPtrVal);
C.addTransition(State);
}
-void CStringChecker::evalBzero(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalBzero(CheckerContext &C, const CallEvent &Call) const {
CurrentFunctionDescription = "memory clearance function";
- DestinationArgExpr Buffer = {{CE->getArg(0), 0}};
- SizeArgExpr Size = {{CE->getArg(1), 1}};
+ DestinationArgExpr Buffer = {{Call.getArgExpr(0), 0}};
+ SizeArgExpr Size = {{Call.getArgExpr(1), 1}};
SVal Zero = C.getSValBuilder().makeZeroVal(C.getASTContext().IntTy);
ProgramStateRef State = C.getState();
@@ -2446,24 +2465,29 @@ void CStringChecker::evalBzero(CheckerContext &C, const CallExpr *CE) const {
C.addTransition(State);
}
-void CStringChecker::evalSprintf(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalSprintf(CheckerContext &C,
+ const CallEvent &Call) const {
CurrentFunctionDescription = "'sprintf'";
+ const auto *CE = cast<CallExpr>(Call.getOriginExpr());
bool IsBI = CE->getBuiltinCallee() == Builtin::BI__builtin___sprintf_chk;
- evalSprintfCommon(C, CE, /* IsBounded */ false, IsBI);
+ evalSprintfCommon(C, Call, /* IsBounded */ false, IsBI);
}
-void CStringChecker::evalSnprintf(CheckerContext &C, const CallExpr *CE) const {
+void CStringChecker::evalSnprintf(CheckerContext &C,
+ const CallEvent &Call) const {
CurrentFunctionDescription = "'snprintf'";
+ const auto *CE = cast<CallExpr>(Call.getOriginExpr());
bool IsBI = CE->getBuiltinCallee() == Builtin::BI__builtin___snprintf_chk;
- evalSprintfCommon(C, CE, /* IsBounded */ true, IsBI);
+ evalSprintfCommon(C, Call, /* IsBounded */ true, IsBI);
}
-void CStringChecker::evalSprintfCommon(CheckerContext &C, const CallExpr *CE,
+void CStringChecker::evalSprintfCommon(CheckerContext &C, const CallEvent &Call,
bool IsBounded, bool IsBuiltin) const {
ProgramStateRef State = C.getState();
- DestinationArgExpr Dest = {{CE->getArg(0), 0}};
+ const auto *CE = cast<CallExpr>(Call.getOriginExpr());
+ DestinationArgExpr Dest = {{Call.getArgExpr(0), 0}};
- const auto NumParams = CE->getCalleeDecl()->getAsFunction()->getNumParams();
+ const auto NumParams = Call.parameters().size();
assert(CE->getNumArgs() >= NumParams);
const auto AllArguments =
@@ -2483,7 +2507,7 @@ void CStringChecker::evalSprintfCommon(CheckerContext &C, const CallExpr *CE,
{Source.Expression, Source.ArgumentIndex}};
State = CheckOverlap(
C, State,
- (IsBounded ? SizeArgExpr{{CE->getArg(1), 1}} : SrcExprAsSizeDummy),
+ (IsBounded ? SizeArgExpr{{Call.getArgExpr(1), 1}} : SrcExprAsSizeDummy),
Dest, Source);
if (!State)
return;
@@ -2536,8 +2560,8 @@ bool CStringChecker::evalCall(const CallEvent &Call, CheckerContext &C) const {
return false;
// Check and evaluate the call.
- const auto *CE = cast<CallExpr>(Call.getOriginExpr());
- Callback(this, C, CE);
+ assert(isa<CallExpr>(Call.getOriginExpr()));
+ Callback(this, C, Call);
// If the evaluate call resulted in no change, chain to the next eval call
// handler.
diff --git a/clang/lib/StaticAnalyzer/Checkers/CXXDeleteChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CXXDeleteChecker.cpp
index eb265f4dde68..b4dee1e300e8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CXXDeleteChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CXXDeleteChecker.cpp
@@ -31,6 +31,7 @@
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
@@ -65,7 +66,8 @@ public:
};
class DeleteWithNonVirtualDtorChecker : public CXXDeleteChecker {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{
+ this, "Destruction of a polymorphic object with no virtual destructor"};
void
checkTypedDeleteExpr(const CXXDeleteExpr *DE, CheckerContext &C,
@@ -74,7 +76,8 @@ class DeleteWithNonVirtualDtorChecker : public CXXDeleteChecker {
};
class CXXArrayDeleteChecker : public CXXDeleteChecker {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this,
+ "Deleting an array of polymorphic objects is undefined"};
void
checkTypedDeleteExpr(const CXXDeleteExpr *DE, CheckerContext &C,
@@ -123,17 +126,10 @@ void DeleteWithNonVirtualDtorChecker::checkTypedDeleteExpr(
if (!DerivedClass->isDerivedFrom(BaseClass))
return;
- if (!BT)
- BT.reset(new BugType(this,
- "Destruction of a polymorphic object with no "
- "virtual destructor",
- "Logic error"));
-
ExplodedNode *N = C.generateNonFatalErrorNode();
if (!N)
return;
- auto R =
- std::make_unique<PathSensitiveBugReport>(*BT, BT->getDescription(), N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, BT.getDescription(), N);
// Mark region of problematic base class for later use in the BugVisitor.
R->markInteresting(BaseClassRegion);
@@ -160,12 +156,6 @@ void CXXArrayDeleteChecker::checkTypedDeleteExpr(
if (!DerivedClass->isDerivedFrom(BaseClass))
return;
- if (!BT)
- BT.reset(new BugType(this,
- "Deleting an array of polymorphic objects "
- "is undefined",
- "Logic error"));
-
ExplodedNode *N = C.generateNonFatalErrorNode();
if (!N)
return;
@@ -182,7 +172,7 @@ void CXXArrayDeleteChecker::checkTypedDeleteExpr(
<< SourceType.getAsString(C.getASTContext().getPrintingPolicy())
<< "' is undefined";
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, OS.str(), N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, OS.str(), N);
// Mark region of problematic base class for later use in the BugVisitor.
R->markInteresting(BaseClassRegion);
diff --git a/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp
index ea74256935ca..f2e1f69c32cf 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp
@@ -125,9 +125,8 @@ private:
if (!BT)
BT.reset(new BugType(OriginalName, desc));
}
- bool uninitRefOrPointer(CheckerContext &C, const SVal &V,
- SourceRange ArgRange, const Expr *ArgEx,
- std::unique_ptr<BugType> &BT,
+ bool uninitRefOrPointer(CheckerContext &C, SVal V, SourceRange ArgRange,
+ const Expr *ArgEx, std::unique_ptr<BugType> &BT,
const ParmVarDecl *ParamDecl, const char *BD,
int ArgumentNumber) const;
};
@@ -185,7 +184,7 @@ static void describeUninitializedArgumentInCall(const CallEvent &Call,
}
bool CallAndMessageChecker::uninitRefOrPointer(
- CheckerContext &C, const SVal &V, SourceRange ArgRange, const Expr *ArgEx,
+ CheckerContext &C, SVal V, SourceRange ArgRange, const Expr *ArgEx,
std::unique_ptr<BugType> &BT, const ParmVarDecl *ParamDecl, const char *BD,
int ArgumentNumber) const {
@@ -263,7 +262,7 @@ public:
if (Find(FR))
return true;
} else {
- const SVal &V = StoreMgr.getBinding(store, loc::MemRegionVal(FR));
+ SVal V = StoreMgr.getBinding(store, loc::MemRegionVal(FR));
if (V.isUndef())
return true;
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp
index d1d4f3baf6a8..a50772f881f7 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp
@@ -24,7 +24,7 @@ using namespace ento;
namespace {
class CastSizeChecker : public Checker< check::PreStmt<CastExpr> > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Cast region with wrong size."};
public:
void checkPreStmt(const CastExpr *CE, CheckerContext &C) const;
@@ -131,12 +131,10 @@ void CastSizeChecker::checkPreStmt(const CastExpr *CE,CheckerContext &C) const {
return;
if (ExplodedNode *errorNode = C.generateErrorNode()) {
- if (!BT)
- BT.reset(new BugType(this, "Cast region with wrong size."));
constexpr llvm::StringLiteral Msg =
"Cast a region whose size is not a multiple of the destination type "
"size.";
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, errorNode);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, errorNode);
R->addRange(CE->getSourceRange());
C.emitReport(std::move(R));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
index fedc6db3723a..978bc0bb082f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
@@ -99,18 +99,23 @@ class ObjCDeallocChecker
check::PointerEscape,
check::PreStmt<ReturnStmt>> {
- mutable IdentifierInfo *NSObjectII, *SenTestCaseII, *XCTestCaseII,
- *Block_releaseII, *CIFilterII;
-
- mutable Selector DeallocSel, ReleaseSel;
-
- std::unique_ptr<BugType> MissingReleaseBugType;
- std::unique_ptr<BugType> ExtraReleaseBugType;
- std::unique_ptr<BugType> MistakenDeallocBugType;
+ mutable const IdentifierInfo *NSObjectII = nullptr;
+ mutable const IdentifierInfo *SenTestCaseII = nullptr;
+ mutable const IdentifierInfo *XCTestCaseII = nullptr;
+ mutable const IdentifierInfo *Block_releaseII = nullptr;
+ mutable const IdentifierInfo *CIFilterII = nullptr;
+
+ mutable Selector DeallocSel;
+ mutable Selector ReleaseSel;
+
+ const BugType MissingReleaseBugType{this, "Missing ivar release (leak)",
+ categories::MemoryRefCount};
+ const BugType ExtraReleaseBugType{this, "Extra ivar release",
+ categories::MemoryRefCount};
+ const BugType MistakenDeallocBugType{this, "Mistaken dealloc",
+ categories::MemoryRefCount};
public:
- ObjCDeallocChecker();
-
void checkASTDecl(const ObjCImplementationDecl *D, AnalysisManager& Mgr,
BugReporter &BR) const;
void checkBeginFunction(CheckerContext &Ctx) const;
@@ -579,7 +584,7 @@ void ObjCDeallocChecker::diagnoseMissingReleases(CheckerContext &C) const {
OS << " by a synthesized property but not released"
" before '[super dealloc]'";
- auto BR = std::make_unique<PathSensitiveBugReport>(*MissingReleaseBugType,
+ auto BR = std::make_unique<PathSensitiveBugReport>(MissingReleaseBugType,
OS.str(), ErrNode);
C.emitReport(std::move(BR));
}
@@ -701,7 +706,7 @@ bool ObjCDeallocChecker::diagnoseExtraRelease(SymbolRef ReleasedValue,
OS << " property but was released in 'dealloc'";
}
- auto BR = std::make_unique<PathSensitiveBugReport>(*ExtraReleaseBugType,
+ auto BR = std::make_unique<PathSensitiveBugReport>(ExtraReleaseBugType,
OS.str(), ErrNode);
BR->addRange(M.getOriginExpr()->getSourceRange());
@@ -743,7 +748,7 @@ bool ObjCDeallocChecker::diagnoseMistakenDealloc(SymbolRef DeallocedValue,
OS << "'" << *PropImpl->getPropertyIvarDecl()
<< "' should be released rather than deallocated";
- auto BR = std::make_unique<PathSensitiveBugReport>(*MistakenDeallocBugType,
+ auto BR = std::make_unique<PathSensitiveBugReport>(MistakenDeallocBugType,
OS.str(), ErrNode);
BR->addRange(M.getOriginExpr()->getSourceRange());
@@ -752,23 +757,6 @@ bool ObjCDeallocChecker::diagnoseMistakenDealloc(SymbolRef DeallocedValue,
return true;
}
-ObjCDeallocChecker::ObjCDeallocChecker()
- : NSObjectII(nullptr), SenTestCaseII(nullptr), XCTestCaseII(nullptr),
- Block_releaseII(nullptr), CIFilterII(nullptr) {
-
- MissingReleaseBugType.reset(
- new BugType(this, "Missing ivar release (leak)",
- categories::MemoryRefCount));
-
- ExtraReleaseBugType.reset(
- new BugType(this, "Extra ivar release",
- categories::MemoryRefCount));
-
- MistakenDeallocBugType.reset(
- new BugType(this, "Mistaken dealloc",
- categories::MemoryRefCount));
-}
-
void ObjCDeallocChecker::initIdentifierInfoAndSelectors(
ASTContext &Ctx) const {
if (NSObjectII)
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
index afc5e6b48008..ce05d2d3c905 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
@@ -140,8 +140,7 @@ void WalkAST::VisitCallExpr(CallExpr *CE) {
if (!II) // if no identifier, not a simple C function
return;
StringRef Name = II->getName();
- if (Name.starts_with("__builtin_"))
- Name = Name.substr(10);
+ Name.consume_front("__builtin_");
// Set the evaluation function by switching on the callee name.
FnCheck evalFunction =
@@ -763,8 +762,7 @@ void WalkAST::checkDeprecatedOrUnsafeBufferHandling(const CallExpr *CE,
enum { DEPR_ONLY = -1, UNKNOWN_CALL = -2 };
StringRef Name = FD->getIdentifier()->getName();
- if (Name.starts_with("__builtin_"))
- Name = Name.substr(10);
+ Name.consume_front("__builtin_");
int ArgIndex =
llvm::StringSwitch<int>(Name)
diff --git a/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
index 9e11d8d9ecbc..be7be15022d3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp
@@ -41,7 +41,7 @@ bool isRootChanged(intptr_t k) { return k == ROOT_CHANGED; }
// bug<--foo()-- JAIL_ENTERED<--foo()--
class ChrootChecker : public Checker<eval::Call, check::PreCall> {
// This bug refers to possibly break out of a chroot() jail.
- mutable std::unique_ptr<BugType> BT_BreakJail;
+ const BugType BT_BreakJail{this, "Break out of jail"};
const CallDescription Chroot{{"chroot"}, 1}, Chdir{{"chdir"}, 1};
@@ -124,12 +124,10 @@ void ChrootChecker::checkPreCall(const CallEvent &Call,
if (k)
if (isRootChanged((intptr_t) *k))
if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
- if (!BT_BreakJail)
- BT_BreakJail.reset(new BugType(this, "Break out of jail"));
constexpr llvm::StringLiteral Msg =
"No call of chdir(\"/\") immediately after chroot";
C.emitReport(
- std::make_unique<PathSensitiveBugReport>(*BT_BreakJail, Msg, N));
+ std::make_unique<PathSensitiveBugReport>(BT_BreakJail, Msg, N));
}
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/CloneChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
index 0e21ea7e90c9..6692a45a09f7 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -35,7 +35,8 @@ public:
private:
mutable CloneDetector Detector;
- mutable std::unique_ptr<BugType> BT_Exact, BT_Suspicious;
+ const BugType BT_Exact{this, "Exact code clone", "Code clone"};
+ const BugType BT_Suspicious{this, "Suspicious code clone", "Code clone"};
public:
void checkASTCodeBody(const Decl *D, AnalysisManager &Mgr,
@@ -107,15 +108,11 @@ static PathDiagnosticLocation makeLocation(const StmtSequence &S,
void CloneChecker::reportClones(
BugReporter &BR, AnalysisManager &Mgr,
std::vector<CloneDetector::CloneGroup> &CloneGroups) const {
-
- if (!BT_Exact)
- BT_Exact.reset(new BugType(this, "Exact code clone", "Code clone"));
-
for (const CloneDetector::CloneGroup &Group : CloneGroups) {
// We group the clones by printing the first as a warning and all others
// as a note.
auto R = std::make_unique<BasicBugReport>(
- *BT_Exact, "Duplicate code detected", makeLocation(Group.front(), Mgr));
+ BT_Exact, "Duplicate code detected", makeLocation(Group.front(), Mgr));
R->addRange(Group.front().getSourceRange());
for (unsigned i = 1; i < Group.size(); ++i)
@@ -154,10 +151,6 @@ void CloneChecker::reportSuspiciousClones(
}
}
- if (!BT_Suspicious)
- BT_Suspicious.reset(
- new BugType(this, "Suspicious code clone", "Code clone"));
-
ASTContext &ACtx = BR.getContext();
SourceManager &SM = ACtx.getSourceManager();
AnalysisDeclContext *ADC =
@@ -170,7 +163,7 @@ void CloneChecker::reportSuspiciousClones(
// Think how to perform more accurate suggestions?
auto R = std::make_unique<BasicBugReport>(
- *BT_Suspicious,
+ BT_Suspicious,
"Potential copy-paste error; did you really mean to use '" +
Pair.FirstCloneInfo.Variable->getNameAsString() + "' here?",
PathDiagnosticLocation::createBegin(Pair.FirstCloneInfo.Mention, SM,
diff --git a/clang/lib/StaticAnalyzer/Checkers/ConversionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ConversionChecker.cpp
index 8b34b41bab21..eca8d3cc0722 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ConversionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ConversionChecker.cpp
@@ -42,7 +42,7 @@ public:
void checkPreStmt(const ImplicitCastExpr *Cast, CheckerContext &C) const;
private:
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Conversion"};
bool isLossOfPrecision(const ImplicitCastExpr *Cast, QualType DestType,
CheckerContext &C) const;
@@ -126,11 +126,8 @@ void ConversionChecker::checkPreStmt(const ImplicitCastExpr *Cast,
void ConversionChecker::reportBug(ExplodedNode *N, const Expr *E,
CheckerContext &C, const char Msg[]) const {
- if (!BT)
- BT.reset(new BugType(this, "Conversion"));
-
// Generate a report for this bug.
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
bugreporter::trackExpressionValue(N, E, *R);
C.emitReport(std::move(R));
}
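
The ChrootChecker, CloneChecker and ConversionChecker hunks above all apply the same refactoring: a lazily allocated, mutable std::unique_ptr<BugType> becomes a const BugType member initialized in-class, and the "if (!BT) BT.reset(...)" guards at the report sites disappear. A minimal sketch of the before/after shape, using a simplified stand-in type rather than the real clang BugType:

// Stand-in types only -- FakeBugType is a hypothetical simplification of
// ento::BugType, not the real API.
#include <memory>
#include <string>
#include <utility>

struct FakeBugType {
  FakeBugType(const void *Checker, std::string Name,
              std::string Category = "Logic error")
      : Name(std::move(Name)), Category(std::move(Category)) { (void)Checker; }
  std::string Name, Category;
};

// Before: heap-allocated on first use, so the member must be mutable and
// every report site needs a lazy-init guard.
class OldStyleChecker {
  mutable std::unique_ptr<FakeBugType> BT;
public:
  const FakeBugType &bugType() const {
    if (!BT)
      BT.reset(new FakeBugType(this, "Conversion"));
    return *BT;
  }
};

// After: a const member with an in-class initializer; report sites just use
// BT directly and the boilerplate is gone.
class NewStyleChecker {
  const FakeBugType BT{this, "Conversion"};
public:
  const FakeBugType &bugType() const { return BT; }
};

int main() {
  OldStyleChecker O;
  NewStyleChecker N;
  return O.bugType().Name == N.bugType().Name ? 0 : 1;
}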
diff --git a/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp
index 832bb78c4803..97f769b1c451 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp
@@ -28,7 +28,8 @@ namespace {
class DebugContainerModeling
: public Checker<eval::Call> {
- std::unique_ptr<BugType> DebugMsgBugType;
+ const BugType DebugMsgBugType{this, "Checking analyzer assumptions", "debug",
+ /*SuppressOnSink=*/true};
template <typename Getter>
void analyzerContainerDataField(const CallExpr *CE, CheckerContext &C,
@@ -48,19 +49,11 @@ class DebugContainerModeling
};
public:
- DebugContainerModeling();
-
bool evalCall(const CallEvent &Call, CheckerContext &C) const;
};
} //namespace
-DebugContainerModeling::DebugContainerModeling() {
- DebugMsgBugType.reset(
- new BugType(this, "Checking analyzer assumptions", "debug",
- /*SuppressOnSink=*/true));
-}
-
bool DebugContainerModeling::evalCall(const CallEvent &Call,
CheckerContext &C) const {
const auto *CE = dyn_cast_or_null<CallExpr>(Call.getOriginExpr());
@@ -137,8 +130,8 @@ ExplodedNode *DebugContainerModeling::reportDebugMsg(llvm::StringRef Msg,
return nullptr;
auto &BR = C.getBugReporter();
- BR.emitReport(std::make_unique<PathSensitiveBugReport>(*DebugMsgBugType,
- Msg, N));
+ BR.emitReport(
+ std::make_unique<PathSensitiveBugReport>(DebugMsgBugType, Msg, N));
return N;
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp
index d05298b42c55..ff479c7b0ac8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp
@@ -28,7 +28,8 @@ namespace {
class DebugIteratorModeling
: public Checker<eval::Call> {
- std::unique_ptr<BugType> DebugMsgBugType;
+ const BugType DebugMsgBugType{this, "Checking analyzer assumptions", "debug",
+ /*SuppressOnSink=*/true};
template <typename Getter>
void analyzerIteratorDataField(const CallExpr *CE, CheckerContext &C,
@@ -51,19 +52,11 @@ class DebugIteratorModeling
};
public:
- DebugIteratorModeling();
-
bool evalCall(const CallEvent &Call, CheckerContext &C) const;
};
} //namespace
-DebugIteratorModeling::DebugIteratorModeling() {
- DebugMsgBugType.reset(
- new BugType(this, "Checking analyzer assumptions", "debug",
- /*SuppressOnSink=*/true));
-}
-
bool DebugIteratorModeling::evalCall(const CallEvent &Call,
CheckerContext &C) const {
const auto *CE = dyn_cast_or_null<CallExpr>(Call.getOriginExpr());
@@ -131,8 +124,8 @@ ExplodedNode *DebugIteratorModeling::reportDebugMsg(llvm::StringRef Msg,
return nullptr;
auto &BR = C.getBugReporter();
- BR.emitReport(std::make_unique<PathSensitiveBugReport>(*DebugMsgBugType,
- Msg, N));
+ BR.emitReport(
+ std::make_unique<PathSensitiveBugReport>(DebugMsgBugType, Msg, N));
return N;
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
index 5331d9574743..5496f087447f 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
@@ -14,6 +14,7 @@
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Checkers/Taint.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
@@ -25,8 +26,8 @@ using namespace taint;
namespace {
class DivZeroChecker : public Checker< check::PreStmt<BinaryOperator> > {
- mutable std::unique_ptr<BugType> BT;
- mutable std::unique_ptr<BugType> TaintBT;
+ const BugType BT{this, "Division by zero"};
+ const BugType TaintBT{this, "Division by zero", categories::TaintedData};
void reportBug(StringRef Msg, ProgramStateRef StateZero,
CheckerContext &C) const;
void reportTaintBug(StringRef Msg, ProgramStateRef StateZero,
@@ -48,10 +49,7 @@ static const Expr *getDenomExpr(const ExplodedNode *N) {
void DivZeroChecker::reportBug(StringRef Msg, ProgramStateRef StateZero,
CheckerContext &C) const {
if (ExplodedNode *N = C.generateErrorNode(StateZero)) {
- if (!BT)
- BT.reset(new BugType(this, "Division by zero", categories::LogicError));
-
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
bugreporter::trackExpressionValue(N, getDenomExpr(N), *R);
C.emitReport(std::move(R));
}
@@ -61,11 +59,7 @@ void DivZeroChecker::reportTaintBug(
StringRef Msg, ProgramStateRef StateZero, CheckerContext &C,
llvm::ArrayRef<SymbolRef> TaintedSyms) const {
if (ExplodedNode *N = C.generateErrorNode(StateZero)) {
- if (!TaintBT)
- TaintBT.reset(
- new BugType(this, "Division by zero", categories::TaintedData));
-
- auto R = std::make_unique<PathSensitiveBugReport>(*TaintBT, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(TaintBT, Msg, N);
bugreporter::trackExpressionValue(N, getDenomExpr(N), *R);
for (auto Sym : TaintedSyms)
R->markInteresting(Sym);
diff --git a/clang/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp
index dbc930d7d37b..0ad307d3ebd5 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DynamicTypeChecker.cpp
@@ -30,12 +30,7 @@ using namespace ento;
namespace {
class DynamicTypeChecker : public Checker<check::PostStmt<ImplicitCastExpr>> {
- mutable std::unique_ptr<BugType> BT;
- void initBugType() const {
- if (!BT)
- BT.reset(
- new BugType(this, "Dynamic and static type mismatch", "Type Error"));
- }
+ const BugType BT{this, "Dynamic and static type mismatch", "Type Error"};
class DynamicTypeBugVisitor : public BugReporterVisitor {
public:
@@ -70,7 +65,6 @@ void DynamicTypeChecker::reportTypeError(QualType DynamicType,
const MemRegion *Reg,
const Stmt *ReportedNode,
CheckerContext &C) const {
- initBugType();
SmallString<192> Buf;
llvm::raw_svector_ostream OS(Buf);
OS << "Object has a dynamic type '";
@@ -81,7 +75,7 @@ void DynamicTypeChecker::reportTypeError(QualType DynamicType,
llvm::Twine());
OS << "'";
auto R = std::make_unique<PathSensitiveBugReport>(
- *BT, OS.str(), C.generateNonFatalErrorNode());
+ BT, OS.str(), C.generateNonFatalErrorNode());
R->markInteresting(Reg);
R->addVisitor(std::make_unique<DynamicTypeBugVisitor>(Reg));
R->addRange(ReportedNode->getSourceRange());
diff --git a/clang/lib/StaticAnalyzer/Checkers/EnumCastOutOfRangeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/EnumCastOutOfRangeChecker.cpp
index 7c51673422a0..0fa20428c1b5 100644
--- a/clang/lib/StaticAnalyzer/Checkers/EnumCastOutOfRangeChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/EnumCastOutOfRangeChecker.cpp
@@ -60,7 +60,7 @@ public:
// Being conservative, it does not warn if there is slight possibility the
// value can be matching.
class EnumCastOutOfRangeChecker : public Checker<check::PreStmt<CastExpr>> {
- mutable std::unique_ptr<BugType> EnumValueCastOutOfRange;
+ const BugType EnumValueCastOutOfRange{this, "Enum cast out of range"};
void reportWarning(CheckerContext &C, const CastExpr *CE,
const EnumDecl *E) const;
@@ -85,10 +85,6 @@ void EnumCastOutOfRangeChecker::reportWarning(CheckerContext &C,
const EnumDecl *E) const {
assert(E && "valid EnumDecl* is expected");
if (const ExplodedNode *N = C.generateNonFatalErrorNode()) {
- if (!EnumValueCastOutOfRange)
- EnumValueCastOutOfRange.reset(
- new BugType(this, "Enum cast out of range"));
-
std::string ValueStr = "", NameStr = "the enum";
// Try to add details to the message:
@@ -105,7 +101,7 @@ void EnumCastOutOfRangeChecker::reportWarning(CheckerContext &C,
"not in the valid range of values for {1}",
ValueStr, NameStr);
- auto BR = std::make_unique<PathSensitiveBugReport>(*EnumValueCastOutOfRange,
+ auto BR = std::make_unique<PathSensitiveBugReport>(EnumValueCastOutOfRange,
Msg, N);
bugreporter::trackExpressionValue(N, CE->getSubExpr(), *BR);
BR->addNote("enum declared here",
diff --git a/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp
index 2c7bacac33a6..3096999e9fd1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ExprInspectionChecker.cpp
@@ -25,7 +25,7 @@ using namespace ento;
namespace {
class ExprInspectionChecker
: public Checker<eval::Call, check::DeadSymbols, check::EndAnalysis> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Checking analyzer assumptions", "debug"};
// These stats are per-analysis, not per-branch, hence they shouldn't
// stay inside the program state.
@@ -176,11 +176,7 @@ ExprInspectionChecker::reportBug(llvm::StringRef Msg, BugReporter &BR,
std::optional<SVal> ExprVal) const {
if (!N)
return nullptr;
-
- if (!BT)
- BT.reset(new BugType(this, "Checking analyzer assumptions", "debug"));
-
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
if (ExprVal) {
R->markInteresting(*ExprVal);
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp
index 2ee201b64008..7aefcdc6d358 100644
--- a/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/FixedAddressChecker.cpp
@@ -24,7 +24,7 @@ using namespace ento;
namespace {
class FixedAddressChecker
: public Checker< check::PreStmt<BinaryOperator> > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Use fixed address"};
public:
void checkPreStmt(const BinaryOperator *B, CheckerContext &C) const;
@@ -50,12 +50,10 @@ void FixedAddressChecker::checkPreStmt(const BinaryOperator *B,
if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
// FIXME: improve grammar in the following strings:
- if (!BT)
- BT.reset(new BugType(this, "Use fixed address"));
constexpr llvm::StringLiteral Msg =
"Using a fixed address is not portable because that address will "
"probably not be valid in all environments or platforms.";
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
R->addRange(B->getRHS()->getSourceRange());
C.emitReport(std::move(R));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/InvalidatedIteratorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/InvalidatedIteratorChecker.cpp
index 6955ba11a28f..3f5856a3efbe 100644
--- a/clang/lib/StaticAnalyzer/Checkers/InvalidatedIteratorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/InvalidatedIteratorChecker.cpp
@@ -31,14 +31,14 @@ class InvalidatedIteratorChecker
check::PreStmt<ArraySubscriptExpr>,
check::PreStmt<MemberExpr>> {
- std::unique_ptr<BugType> InvalidatedBugType;
+ const BugType InvalidatedBugType{this, "Iterator invalidated",
+ "Misuse of STL APIs"};
- void verifyAccess(CheckerContext &C, const SVal &Val) const;
- void reportBug(const StringRef &Message, const SVal &Val,
- CheckerContext &C, ExplodedNode *ErrNode) const;
-public:
- InvalidatedIteratorChecker();
+ void verifyAccess(CheckerContext &C, SVal Val) const;
+ void reportBug(StringRef Message, SVal Val, CheckerContext &C,
+ ExplodedNode *ErrNode) const;
+public:
void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
void checkPreStmt(const UnaryOperator *UO, CheckerContext &C) const;
void checkPreStmt(const BinaryOperator *BO, CheckerContext &C) const;
@@ -49,11 +49,6 @@ public:
} //namespace
-InvalidatedIteratorChecker::InvalidatedIteratorChecker() {
- InvalidatedBugType.reset(
- new BugType(this, "Iterator invalidated", "Misuse of STL APIs"));
-}
-
void InvalidatedIteratorChecker::checkPreCall(const CallEvent &Call,
CheckerContext &C) const {
// Check for access of invalidated position
@@ -114,7 +109,8 @@ void InvalidatedIteratorChecker::checkPreStmt(const MemberExpr *ME,
verifyAccess(C, BaseVal);
}
-void InvalidatedIteratorChecker::verifyAccess(CheckerContext &C, const SVal &Val) const {
+void InvalidatedIteratorChecker::verifyAccess(CheckerContext &C,
+ SVal Val) const {
auto State = C.getState();
const auto *Pos = getIteratorPosition(State, Val);
if (Pos && !Pos->isValid()) {
@@ -126,11 +122,11 @@ void InvalidatedIteratorChecker::verifyAccess(CheckerContext &C, const SVal &Val
}
}
-void InvalidatedIteratorChecker::reportBug(const StringRef &Message,
- const SVal &Val, CheckerContext &C,
+void InvalidatedIteratorChecker::reportBug(StringRef Message, SVal Val,
+ CheckerContext &C,
ExplodedNode *ErrNode) const {
- auto R = std::make_unique<PathSensitiveBugReport>(*InvalidatedBugType,
- Message, ErrNode);
+ auto R = std::make_unique<PathSensitiveBugReport>(InvalidatedBugType, Message,
+ ErrNode);
R->markInteresting(Val);
C.emitReport(std::move(R));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/Iterator.cpp b/clang/lib/StaticAnalyzer/Checkers/Iterator.cpp
index 90047a2899a7..e8d35aac2efd 100644
--- a/clang/lib/StaticAnalyzer/Checkers/Iterator.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/Iterator.cpp
@@ -181,8 +181,7 @@ const ContainerData *getContainerData(ProgramStateRef State,
return State->get<ContainerMap>(Cont);
}
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
- const SVal &Val) {
+const IteratorPosition *getIteratorPosition(ProgramStateRef State, SVal Val) {
if (auto Reg = Val.getAsRegion()) {
Reg = Reg->getMostDerivedObjectRegion();
return State->get<IteratorRegionMap>(Reg);
@@ -194,7 +193,7 @@ const IteratorPosition *getIteratorPosition(ProgramStateRef State,
return nullptr;
}
-ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
+ProgramStateRef setIteratorPosition(ProgramStateRef State, SVal Val,
const IteratorPosition &Pos) {
if (auto Reg = Val.getAsRegion()) {
Reg = Reg->getMostDerivedObjectRegion();
@@ -207,8 +206,8 @@ ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
return nullptr;
}
-ProgramStateRef createIteratorPosition(ProgramStateRef State, const SVal &Val,
- const MemRegion *Cont, const Stmt* S,
+ProgramStateRef createIteratorPosition(ProgramStateRef State, SVal Val,
+ const MemRegion *Cont, const Stmt *S,
const LocationContext *LCtx,
unsigned blockCount) {
auto &StateMgr = State->getStateManager();
@@ -221,9 +220,8 @@ ProgramStateRef createIteratorPosition(ProgramStateRef State, const SVal &Val,
IteratorPosition::getPosition(Cont, Sym));
}
-ProgramStateRef advancePosition(ProgramStateRef State, const SVal &Iter,
- OverloadedOperatorKind Op,
- const SVal &Distance) {
+ProgramStateRef advancePosition(ProgramStateRef State, SVal Iter,
+ OverloadedOperatorKind Op, SVal Distance) {
const auto *Pos = getIteratorPosition(State, Iter);
if (!Pos)
return nullptr;
diff --git a/clang/lib/StaticAnalyzer/Checkers/Iterator.h b/clang/lib/StaticAnalyzer/Checkers/Iterator.h
index 353daf0bed08..46de8ea01d77 100644
--- a/clang/lib/StaticAnalyzer/Checkers/Iterator.h
+++ b/clang/lib/StaticAnalyzer/Checkers/Iterator.h
@@ -161,18 +161,15 @@ bool isRandomIncrOrDecrOperator(OverloadedOperatorKind OK);
bool isRandomIncrOrDecrOperator(BinaryOperatorKind OK);
const ContainerData *getContainerData(ProgramStateRef State,
const MemRegion *Cont);
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
- const SVal &Val);
-ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
+const IteratorPosition *getIteratorPosition(ProgramStateRef State, SVal Val);
+ProgramStateRef setIteratorPosition(ProgramStateRef State, SVal Val,
const IteratorPosition &Pos);
-ProgramStateRef createIteratorPosition(ProgramStateRef State, const SVal &Val,
- const MemRegion *Cont, const Stmt* S,
+ProgramStateRef createIteratorPosition(ProgramStateRef State, SVal Val,
+ const MemRegion *Cont, const Stmt *S,
const LocationContext *LCtx,
unsigned blockCount);
-ProgramStateRef advancePosition(ProgramStateRef State,
- const SVal &Iter,
- OverloadedOperatorKind Op,
- const SVal &Distance);
+ProgramStateRef advancePosition(ProgramStateRef State, SVal Iter,
+ OverloadedOperatorKind Op, SVal Distance);
ProgramStateRef assumeNoOverflow(ProgramStateRef State, SymbolRef Sym,
long Scale);
bool compare(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2,
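
The Iterator.cpp/Iterator.h hunks above switch the iterator helpers from taking const SVal& to taking SVal by value; SVal is a small value wrapper, so copying it costs no more than passing a reference and avoids the extra indirection. A hedged sketch of the signature change, using a hypothetical stand-in type rather than the real SVal:

// SmallVal is a made-up stand-in for SVal (a small, trivially copyable
// value wrapper); only the parameter-passing style is being illustrated.
#include <cstdint>

struct SmallVal {
  const void *Data = nullptr;
  uint32_t Kind = 0;
};

// Before: the value was threaded through as a const reference.
bool isInterestingByRef(const SmallVal &V) { return V.Kind != 0; }

// After: pass by value -- copies two words, no indirection, and callers can
// hand in temporaries without binding a reference.
bool isInterestingByValue(SmallVal V) { return V.Kind != 0; }

int main() {
  SmallVal V{nullptr, 1};
  return (isInterestingByRef(V) == isInterestingByValue(V)) ? 0 : 1;
}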
diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp
index 2d51a000ece3..a95e811c2a41 100644
--- a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp
@@ -100,18 +100,17 @@ class IteratorModeling
const AdvanceFn *Handler) const;
void handleComparison(CheckerContext &C, const Expr *CE, SVal RetVal,
- const SVal &LVal, const SVal &RVal,
- OverloadedOperatorKind Op) const;
+ SVal LVal, SVal RVal, OverloadedOperatorKind Op) const;
void processComparison(CheckerContext &C, ProgramStateRef State,
- SymbolRef Sym1, SymbolRef Sym2, const SVal &RetVal,
+ SymbolRef Sym1, SymbolRef Sym2, SVal RetVal,
OverloadedOperatorKind Op) const;
- void handleIncrement(CheckerContext &C, const SVal &RetVal, const SVal &Iter,
+ void handleIncrement(CheckerContext &C, SVal RetVal, SVal Iter,
bool Postfix) const;
- void handleDecrement(CheckerContext &C, const SVal &RetVal, const SVal &Iter,
+ void handleDecrement(CheckerContext &C, SVal RetVal, SVal Iter,
bool Postfix) const;
void handleRandomIncrOrDecr(CheckerContext &C, const Expr *CE,
- OverloadedOperatorKind Op, const SVal &RetVal,
- const SVal &Iterator, const SVal &Amount) const;
+ OverloadedOperatorKind Op, SVal RetVal,
+ SVal Iterator, SVal Amount) const;
void handlePtrIncrOrDecr(CheckerContext &C, const Expr *Iterator,
OverloadedOperatorKind OK, SVal Offset) const;
void handleAdvance(CheckerContext &C, const Expr *CE, SVal RetVal, SVal Iter,
@@ -120,7 +119,7 @@ class IteratorModeling
SVal Amount) const;
void handleNext(CheckerContext &C, const Expr *CE, SVal RetVal, SVal Iter,
SVal Amount) const;
- void assignToContainer(CheckerContext &C, const Expr *CE, const SVal &RetVal,
+ void assignToContainer(CheckerContext &C, const Expr *CE, SVal RetVal,
const MemRegion *Cont) const;
bool noChangeInAdvance(CheckerContext &C, SVal Iter, const Expr *CE) const;
void printState(raw_ostream &Out, ProgramStateRef State, const char *NL,
@@ -160,7 +159,7 @@ public:
bool isSimpleComparisonOperator(OverloadedOperatorKind OK);
bool isSimpleComparisonOperator(BinaryOperatorKind OK);
-ProgramStateRef removeIteratorPosition(ProgramStateRef State, const SVal &Val);
+ProgramStateRef removeIteratorPosition(ProgramStateRef State, SVal Val);
ProgramStateRef relateSymbols(ProgramStateRef State, SymbolRef Sym1,
SymbolRef Sym2, bool Equal);
bool isBoundThroughLazyCompoundVal(const Environment &Env,
@@ -283,7 +282,7 @@ void IteratorModeling::checkPostStmt(const BinaryOperator *BO,
// The non-iterator side must have an integral or enumeration type.
if (!AmountExpr->getType()->isIntegralOrEnumerationType())
return;
- const SVal &AmountVal = IsIterOnLHS ? RVal : LVal;
+ SVal AmountVal = IsIterOnLHS ? RVal : LVal;
handlePtrIncrOrDecr(C, IterExpr, BinaryOperator::getOverloadedOperator(OK),
AmountVal);
}
@@ -388,8 +387,8 @@ IteratorModeling::handleOverloadedOperator(CheckerContext &C,
const bool IsIterFirst = FirstType->isStructureOrClassType();
const SVal FirstArg = Call.getArgSVal(0);
const SVal SecondArg = Call.getArgSVal(1);
- const SVal &Iterator = IsIterFirst ? FirstArg : SecondArg;
- const SVal &Amount = IsIterFirst ? SecondArg : FirstArg;
+ SVal Iterator = IsIterFirst ? FirstArg : SecondArg;
+ SVal Amount = IsIterFirst ? SecondArg : FirstArg;
handleRandomIncrOrDecr(C, OrigExpr, Op, Call.getReturnValue(),
Iterator, Amount);
@@ -444,14 +443,13 @@ IteratorModeling::handleAdvanceLikeFunction(CheckerContext &C,
}
void IteratorModeling::handleComparison(CheckerContext &C, const Expr *CE,
- SVal RetVal, const SVal &LVal,
- const SVal &RVal,
- OverloadedOperatorKind Op) const {
+ SVal RetVal, SVal LVal, SVal RVal,
+ OverloadedOperatorKind Op) const {
// Record the operands and the operator of the comparison for the next
// evalAssume, if the result is a symbolic expression. If it is a concrete
// value (only one branch is possible), then transfer the state between
// the operands according to the operator and the result
- auto State = C.getState();
+ auto State = C.getState();
const auto *LPos = getIteratorPosition(State, LVal);
const auto *RPos = getIteratorPosition(State, RVal);
const MemRegion *Cont = nullptr;
@@ -504,7 +502,7 @@ void IteratorModeling::handleComparison(CheckerContext &C, const Expr *CE,
void IteratorModeling::processComparison(CheckerContext &C,
ProgramStateRef State, SymbolRef Sym1,
- SymbolRef Sym2, const SVal &RetVal,
+ SymbolRef Sym2, SVal RetVal,
OverloadedOperatorKind Op) const {
if (const auto TruthVal = RetVal.getAs<nonloc::ConcreteInt>()) {
if ((State = relateSymbols(State, Sym1, Sym2,
@@ -532,8 +530,8 @@ void IteratorModeling::processComparison(CheckerContext &C,
}
}
-void IteratorModeling::handleIncrement(CheckerContext &C, const SVal &RetVal,
- const SVal &Iter, bool Postfix) const {
+void IteratorModeling::handleIncrement(CheckerContext &C, SVal RetVal,
+ SVal Iter, bool Postfix) const {
// Increment the symbolic expressions which represents the position of the
// iterator
auto State = C.getState();
@@ -558,8 +556,8 @@ void IteratorModeling::handleIncrement(CheckerContext &C, const SVal &RetVal,
C.addTransition(State);
}
-void IteratorModeling::handleDecrement(CheckerContext &C, const SVal &RetVal,
- const SVal &Iter, bool Postfix) const {
+void IteratorModeling::handleDecrement(CheckerContext &C, SVal RetVal,
+ SVal Iter, bool Postfix) const {
// Decrement the symbolic expressions which represents the position of the
// iterator
auto State = C.getState();
@@ -586,9 +584,8 @@ void IteratorModeling::handleDecrement(CheckerContext &C, const SVal &RetVal,
void IteratorModeling::handleRandomIncrOrDecr(CheckerContext &C, const Expr *CE,
OverloadedOperatorKind Op,
- const SVal &RetVal,
- const SVal &Iterator,
- const SVal &Amount) const {
+ SVal RetVal, SVal Iterator,
+ SVal Amount) const {
// Increment or decrement the symbolic expressions which represents the
// position of the iterator
auto State = C.getState();
@@ -684,7 +681,7 @@ void IteratorModeling::handleNext(CheckerContext &C, const Expr *CE,
}
void IteratorModeling::assignToContainer(CheckerContext &C, const Expr *CE,
- const SVal &RetVal,
+ SVal RetVal,
const MemRegion *Cont) const {
Cont = Cont->getMostDerivedObjectRegion();
@@ -772,7 +769,7 @@ bool isSimpleComparisonOperator(BinaryOperatorKind OK) {
return OK == BO_EQ || OK == BO_NE;
}
-ProgramStateRef removeIteratorPosition(ProgramStateRef State, const SVal &Val) {
+ProgramStateRef removeIteratorPosition(ProgramStateRef State, SVal Val) {
if (auto Reg = Val.getAsRegion()) {
Reg = Reg->getMostDerivedObjectRegion();
return State->remove<IteratorRegionMap>(Reg);
diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp
index 7740c3d4da1e..c8828219dd73 100644
--- a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp
@@ -32,7 +32,8 @@ class IteratorRangeChecker
check::PreStmt<ArraySubscriptExpr>,
check::PreStmt<MemberExpr>> {
- std::unique_ptr<BugType> OutOfRangeBugType;
+ const BugType OutOfRangeBugType{this, "Iterator out of range",
+ "Misuse of STL APIs"};
void verifyDereference(CheckerContext &C, SVal Val) const;
void verifyIncrement(CheckerContext &C, SVal Iter) const;
@@ -42,12 +43,10 @@ class IteratorRangeChecker
void verifyAdvance(CheckerContext &C, SVal LHS, SVal RHS) const;
void verifyPrev(CheckerContext &C, SVal LHS, SVal RHS) const;
void verifyNext(CheckerContext &C, SVal LHS, SVal RHS) const;
- void reportBug(const StringRef &Message, SVal Val, CheckerContext &C,
+ void reportBug(StringRef Message, SVal Val, CheckerContext &C,
ExplodedNode *ErrNode) const;
public:
- IteratorRangeChecker();
-
void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
void checkPreStmt(const UnaryOperator *UO, CheckerContext &C) const;
void checkPreStmt(const BinaryOperator *BO, CheckerContext &C) const;
@@ -67,15 +66,10 @@ public:
bool isPastTheEnd(ProgramStateRef State, const IteratorPosition &Pos);
bool isAheadOfRange(ProgramStateRef State, const IteratorPosition &Pos);
bool isBehindPastTheEnd(ProgramStateRef State, const IteratorPosition &Pos);
-bool isZero(ProgramStateRef State, const NonLoc &Val);
+bool isZero(ProgramStateRef State, NonLoc Val);
} //namespace
-IteratorRangeChecker::IteratorRangeChecker() {
- OutOfRangeBugType.reset(
- new BugType(this, "Iterator out of range", "Misuse of STL APIs"));
-}
-
void IteratorRangeChecker::checkPreCall(const CallEvent &Call,
CheckerContext &C) const {
// Check for out of range access
@@ -275,10 +269,10 @@ void IteratorRangeChecker::verifyNext(CheckerContext &C, SVal LHS,
verifyRandomIncrOrDecr(C, OO_Plus, LHS, RHS);
}
-void IteratorRangeChecker::reportBug(const StringRef &Message, SVal Val,
+void IteratorRangeChecker::reportBug(StringRef Message, SVal Val,
CheckerContext &C,
ExplodedNode *ErrNode) const {
- auto R = std::make_unique<PathSensitiveBugReport>(*OutOfRangeBugType, Message,
+ auto R = std::make_unique<PathSensitiveBugReport>(OutOfRangeBugType, Message,
ErrNode);
const auto *Pos = getIteratorPosition(C.getState(), Val);
@@ -295,7 +289,7 @@ bool isLess(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2);
bool isGreater(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2);
bool isEqual(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2);
-bool isZero(ProgramStateRef State, const NonLoc &Val) {
+bool isZero(ProgramStateRef State, NonLoc Val) {
auto &BVF = State->getBasicVals();
return compare(State, Val,
nonloc::ConcreteInt(BVF.getValue(llvm::APSInt::get(0))),
diff --git a/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
index 70f911fc66ab..de9efe17d220 100644
--- a/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp
@@ -62,7 +62,8 @@ class NonLocalizedStringChecker
check::PostObjCMessage,
check::PostStmt<ObjCStringLiteral>> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Unlocalizable string",
+ "Localizability Issue (Apple)"};
// Methods that require a localized string
mutable llvm::DenseMap<const IdentifierInfo *,
@@ -89,8 +90,6 @@ class NonLocalizedStringChecker
Selector S) const;
public:
- NonLocalizedStringChecker();
-
// When this parameter is set to true, the checker assumes all
// methods that return NSStrings are unlocalized. Thus, more false
// positives will be reported.
@@ -108,11 +107,6 @@ public:
REGISTER_MAP_WITH_PROGRAMSTATE(LocalizedMemMap, const MemRegion *,
LocalizedState)
-NonLocalizedStringChecker::NonLocalizedStringChecker() {
- BT.reset(new BugType(this, "Unlocalizable string",
- "Localizability Issue (Apple)"));
-}
-
namespace {
class NonLocalizedStringBRVisitor final : public BugReporterVisitor {
@@ -764,7 +758,7 @@ void NonLocalizedStringChecker::reportLocalizationError(
// Generate the bug report.
auto R = std::make_unique<PathSensitiveBugReport>(
- *BT, "User-facing text should use localized string macro", ErrNode);
+ BT, "User-facing text should use localized string macro", ErrNode);
if (argumentNumber) {
R->addRange(M.getArgExpr(argumentNumber - 1)->getSourceRange());
} else {
diff --git a/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp b/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp
index bbf2ddec5762..3e374e6c240e 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.cpp
@@ -31,7 +31,7 @@ void MPIBugReporter::reportDoubleNonblocking(
RequestRegion->getDescriptiveName() + ". ";
auto Report = std::make_unique<PathSensitiveBugReport>(
- *DoubleNonblockingBugType, ErrorText, ExplNode);
+ DoubleNonblockingBugType, ErrorText, ExplNode);
Report->addRange(MPICallEvent.getSourceRange());
SourceRange Range = RequestRegion->sourceRange();
@@ -53,7 +53,7 @@ void MPIBugReporter::reportMissingWait(
std::string ErrorText{"Request " + RequestRegion->getDescriptiveName() +
" has no matching wait. "};
- auto Report = std::make_unique<PathSensitiveBugReport>(*MissingWaitBugType,
+ auto Report = std::make_unique<PathSensitiveBugReport>(MissingWaitBugType,
ErrorText, ExplNode);
SourceRange Range = RequestRegion->sourceRange();
@@ -73,7 +73,7 @@ void MPIBugReporter::reportUnmatchedWait(
std::string ErrorText{"Request " + RequestRegion->getDescriptiveName() +
" has no matching nonblocking call. "};
- auto Report = std::make_unique<PathSensitiveBugReport>(*UnmatchedWaitBugType,
+ auto Report = std::make_unique<PathSensitiveBugReport>(UnmatchedWaitBugType,
ErrorText, ExplNode);
Report->addRange(CE.getSourceRange());
diff --git a/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h b/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h
index 9871da026b04..0222a2120b34 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h
+++ b/clang/lib/StaticAnalyzer/Checkers/MPI-Checker/MPIBugReporter.h
@@ -17,6 +17,7 @@
#include "MPITypes.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "llvm/ADT/StringRef.h"
namespace clang {
namespace ento {
@@ -24,12 +25,10 @@ namespace mpi {
class MPIBugReporter {
public:
- MPIBugReporter(const CheckerBase &CB) {
- UnmatchedWaitBugType.reset(new BugType(&CB, "Unmatched wait", MPIError));
- DoubleNonblockingBugType.reset(
- new BugType(&CB, "Double nonblocking", MPIError));
- MissingWaitBugType.reset(new BugType(&CB, "Missing wait", MPIError));
- }
+ MPIBugReporter(const CheckerBase &CB)
+ : UnmatchedWaitBugType(&CB, "Unmatched wait", MPIError),
+ MissingWaitBugType(&CB, "Missing wait", MPIError),
+ DoubleNonblockingBugType(&CB, "Double nonblocking", MPIError) {}
/// Report duplicate request use by nonblocking calls without intermediate
/// wait.
@@ -68,12 +67,10 @@ public:
BugReporter &BReporter) const;
private:
- const std::string MPIError = "MPI Error";
-
- // path-sensitive bug types
- std::unique_ptr<BugType> UnmatchedWaitBugType;
- std::unique_ptr<BugType> MissingWaitBugType;
- std::unique_ptr<BugType> DoubleNonblockingBugType;
+ const llvm::StringLiteral MPIError = "MPI Error";
+ const BugType UnmatchedWaitBugType;
+ const BugType MissingWaitBugType;
+ const BugType DoubleNonblockingBugType;
/// Bug visitor class to find the node where the request region was previously
/// used in order to include it into the BugReport path.
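
In the MPIBugReporter.h hunk above, the three bug types become plain const members built in the constructor's mem-initializer list from the MPIError category declared just before them. That ordering is safe because non-static data members are constructed in declaration order, so MPIError is fully initialized before the BugType members that use it. A small stand-in sketch of that rule (hypothetical types, not the clang/MPI classes):

// Demonstrates construction in declaration order; the member declared first
// (the category string) is built before the members initialized from it.
#include <string>

struct Reporter {
  Reporter()                                 // the mem-init list order does
      : First(Category + ": first"),         // not matter; construction
        Second(Category + ": second") {}     // follows declaration order
private:
  const std::string Category = "MPI Error";  // declared (and built) first
  const std::string First;
  const std::string Second;

public:
  const std::string &first() const { return First; }
};

int main() { return Reporter().first() == "MPI Error: first" ? 0 : 1; }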
diff --git a/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
index 771c0a5fbb8d..12bf12a0b232 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MacOSKeychainAPIChecker.cpp
@@ -33,7 +33,8 @@ class MacOSKeychainAPIChecker : public Checker<check::PreStmt<CallExpr>,
check::DeadSymbols,
check::PointerEscape,
eval::Assume> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Improper use of SecKeychain API",
+ categories::AppleAPIMisuse};
public:
/// AllocationState is a part of the checker specific state together with the
@@ -101,12 +102,6 @@ private:
/// function.
static unsigned getTrackedFunctionIndex(StringRef Name, bool IsAllocator);
- inline void initBugType() const {
- if (!BT)
- BT.reset(new BugType(this, "Improper use of SecKeychain API",
- "API Misuse (Apple)"));
- }
-
void generateDeallocatorMismatchReport(const AllocationPair &AP,
const Expr *ArgExpr,
CheckerContext &C) const;
@@ -232,7 +227,6 @@ void MacOSKeychainAPIChecker::
if (!N)
return;
- initBugType();
SmallString<80> sbuf;
llvm::raw_svector_ostream os(sbuf);
unsigned int PDeallocIdx =
@@ -240,7 +234,7 @@ void MacOSKeychainAPIChecker::
os << "Deallocator doesn't match the allocator: '"
<< FunctionsToTrack[PDeallocIdx].Name << "' should be used.";
- auto Report = std::make_unique<PathSensitiveBugReport>(*BT, os.str(), N);
+ auto Report = std::make_unique<PathSensitiveBugReport>(BT, os.str(), N);
Report->addVisitor(std::make_unique<SecKeychainBugVisitor>(AP.first));
Report->addRange(ArgExpr->getSourceRange());
markInteresting(Report.get(), AP);
@@ -276,7 +270,6 @@ void MacOSKeychainAPIChecker::checkPreStmt(const CallExpr *CE,
ExplodedNode *N = C.generateNonFatalErrorNode(State);
if (!N)
return;
- initBugType();
SmallString<128> sbuf;
llvm::raw_svector_ostream os(sbuf);
unsigned int DIdx = FunctionsToTrack[AS->AllocatorIdx].DeallocatorIdx;
@@ -284,8 +277,7 @@ void MacOSKeychainAPIChecker::checkPreStmt(const CallExpr *CE,
<< "the allocator: missing a call to '"
<< FunctionsToTrack[DIdx].Name
<< "'.";
- auto Report =
- std::make_unique<PathSensitiveBugReport>(*BT, os.str(), N);
+ auto Report = std::make_unique<PathSensitiveBugReport>(BT, os.str(), N);
Report->addVisitor(std::make_unique<SecKeychainBugVisitor>(V));
Report->addRange(ArgExpr->getSourceRange());
Report->markInteresting(AS->Region);
@@ -338,9 +330,8 @@ void MacOSKeychainAPIChecker::checkPreStmt(const CallExpr *CE,
ExplodedNode *N = C.generateNonFatalErrorNode(State);
if (!N)
return;
- initBugType();
auto Report = std::make_unique<PathSensitiveBugReport>(
- *BT, "Trying to free data which has not been allocated.", N);
+ BT, "Trying to free data which has not been allocated.", N);
Report->addRange(ArgExpr->getSourceRange());
if (AS)
Report->markInteresting(AS->Region);
@@ -474,7 +465,6 @@ std::unique_ptr<PathSensitiveBugReport>
MacOSKeychainAPIChecker::generateAllocatedDataNotReleasedReport(
const AllocationPair &AP, ExplodedNode *N, CheckerContext &C) const {
const ADFunctionInfo &FI = FunctionsToTrack[AP.second->AllocatorIdx];
- initBugType();
SmallString<70> sbuf;
llvm::raw_svector_ostream os(sbuf);
os << "Allocated data is not released: missing a call to '"
@@ -493,7 +483,7 @@ MacOSKeychainAPIChecker::generateAllocatedDataNotReleasedReport(
AllocNode->getLocationContext());
auto Report = std::make_unique<PathSensitiveBugReport>(
- *BT, os.str(), N, LocUsedForUniqueing,
+ BT, os.str(), N, LocUsedForUniqueing,
AllocNode->getLocationContext()->getDecl());
Report->addVisitor(std::make_unique<SecKeychainBugVisitor>(AP.first));
diff --git a/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp
index 04e7f8dec8d7..754b16764296 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp
@@ -18,6 +18,7 @@
#include "clang/Basic/TargetInfo.h"
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
@@ -31,7 +32,8 @@ using namespace ento;
namespace {
class MacOSXAPIChecker : public Checker< check::PreStmt<CallExpr> > {
- mutable std::unique_ptr<BugType> BT_dispatchOnce;
+ const BugType BT_dispatchOnce{this, "Improper use of 'dispatch_once'",
+ categories::AppleAPIMisuse};
static const ObjCIvarRegion *getParentIvarRegion(const MemRegion *R);
@@ -136,12 +138,8 @@ void MacOSXAPIChecker::CheckDispatchOnce(CheckerContext &C, const CallExpr *CE,
if (!N)
return;
- if (!BT_dispatchOnce)
- BT_dispatchOnce.reset(new BugType(this, "Improper use of 'dispatch_once'",
- "API Misuse (Apple)"));
-
auto report =
- std::make_unique<PathSensitiveBugReport>(*BT_dispatchOnce, os.str(), N);
+ std::make_unique<PathSensitiveBugReport>(BT_dispatchOnce, os.str(), N);
report->addRange(CE->getArg(0)->getSourceRange());
C.emitReport(std::move(report));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/MismatchedIteratorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MismatchedIteratorChecker.cpp
index 2020dc7cc791..82a622831817 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MismatchedIteratorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MismatchedIteratorChecker.cpp
@@ -30,22 +30,18 @@ namespace {
class MismatchedIteratorChecker
: public Checker<check::PreCall, check::PreStmt<BinaryOperator>> {
- std::unique_ptr<BugType> MismatchedBugType;
-
- void verifyMatch(CheckerContext &C, const SVal &Iter,
- const MemRegion *Cont) const;
- void verifyMatch(CheckerContext &C, const SVal &Iter1,
- const SVal &Iter2) const;
- void reportBug(const StringRef &Message, const SVal &Val1,
- const SVal &Val2, CheckerContext &C,
- ExplodedNode *ErrNode) const;
- void reportBug(const StringRef &Message, const SVal &Val,
- const MemRegion *Reg, CheckerContext &C,
+ const BugType MismatchedBugType{this, "Iterator(s) mismatched",
+ "Misuse of STL APIs",
+ /*SuppressOnSink=*/true};
+
+ void verifyMatch(CheckerContext &C, SVal Iter, const MemRegion *Cont) const;
+ void verifyMatch(CheckerContext &C, SVal Iter1, SVal Iter2) const;
+ void reportBug(StringRef Message, SVal Val1, SVal Val2, CheckerContext &C,
ExplodedNode *ErrNode) const;
+ void reportBug(StringRef Message, SVal Val, const MemRegion *Reg,
+ CheckerContext &C, ExplodedNode *ErrNode) const;
public:
- MismatchedIteratorChecker();
-
void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
void checkPreStmt(const BinaryOperator *BO, CheckerContext &C) const;
@@ -53,12 +49,6 @@ public:
} // namespace
-MismatchedIteratorChecker::MismatchedIteratorChecker() {
- MismatchedBugType.reset(
- new BugType(this, "Iterator(s) mismatched", "Misuse of STL APIs",
- /*SuppressOnSink=*/true));
-}
-
void MismatchedIteratorChecker::checkPreCall(const CallEvent &Call,
CheckerContext &C) const {
// Check for iterator mismatches
@@ -202,7 +192,7 @@ void MismatchedIteratorChecker::checkPreStmt(const BinaryOperator *BO,
verifyMatch(C, LVal, RVal);
}
-void MismatchedIteratorChecker::verifyMatch(CheckerContext &C, const SVal &Iter,
+void MismatchedIteratorChecker::verifyMatch(CheckerContext &C, SVal Iter,
const MemRegion *Cont) const {
// Verify match between a container and the container of an iterator
Cont = Cont->getMostDerivedObjectRegion();
@@ -238,9 +228,8 @@ void MismatchedIteratorChecker::verifyMatch(CheckerContext &C, const SVal &Iter,
}
}
-void MismatchedIteratorChecker::verifyMatch(CheckerContext &C,
- const SVal &Iter1,
- const SVal &Iter2) const {
+void MismatchedIteratorChecker::verifyMatch(CheckerContext &C, SVal Iter1,
+ SVal Iter2) const {
// Verify match between the containers of two iterators
auto State = C.getState();
const auto *Pos1 = getIteratorPosition(State, Iter1);
@@ -277,23 +266,21 @@ void MismatchedIteratorChecker::verifyMatch(CheckerContext &C,
}
}
-void MismatchedIteratorChecker::reportBug(const StringRef &Message,
- const SVal &Val1,
- const SVal &Val2,
- CheckerContext &C,
+void MismatchedIteratorChecker::reportBug(StringRef Message, SVal Val1,
+ SVal Val2, CheckerContext &C,
ExplodedNode *ErrNode) const {
- auto R = std::make_unique<PathSensitiveBugReport>(*MismatchedBugType, Message,
+ auto R = std::make_unique<PathSensitiveBugReport>(MismatchedBugType, Message,
ErrNode);
R->markInteresting(Val1);
R->markInteresting(Val2);
C.emitReport(std::move(R));
}
-void MismatchedIteratorChecker::reportBug(const StringRef &Message,
- const SVal &Val, const MemRegion *Reg,
+void MismatchedIteratorChecker::reportBug(StringRef Message, SVal Val,
+ const MemRegion *Reg,
CheckerContext &C,
ExplodedNode *ErrNode) const {
- auto R = std::make_unique<PathSensitiveBugReport>(*MismatchedBugType, Message,
+ auto R = std::make_unique<PathSensitiveBugReport>(MismatchedBugType, Message,
ErrNode);
R->markInteresting(Val);
R->markInteresting(Reg);
diff --git a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
index 8fc44e78be6f..2e31c16e457c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp
@@ -15,6 +15,7 @@
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
@@ -31,7 +32,9 @@ class MmapWriteExecChecker : public Checker<check::PreCall> {
static int ProtWrite;
static int ProtExec;
static int ProtRead;
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "W^X check fails, Write Exec prot flags set",
+ "Security"};
+
public:
MmapWriteExecChecker() : MmapFn({"mmap"}, 6), MprotectFn({"mprotect"}, 3) {}
void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
@@ -62,17 +65,16 @@ void MmapWriteExecChecker::checkPreCall(const CallEvent &Call,
return;
if ((Prot & (ProtWrite | ProtExec)) == (ProtWrite | ProtExec)) {
- if (!BT)
- BT.reset(new BugType(this, "W^X check fails, Write Exec prot flags set", "Security"));
-
ExplodedNode *N = C.generateNonFatalErrorNode();
if (!N)
return;
auto Report = std::make_unique<PathSensitiveBugReport>(
- *BT, "Both PROT_WRITE and PROT_EXEC flags are set. This can "
- "lead to exploitable memory regions, which could be overwritten "
- "with malicious code", N);
+ BT,
+ "Both PROT_WRITE and PROT_EXEC flags are set. This can "
+ "lead to exploitable memory regions, which could be overwritten "
+ "with malicious code",
+ N);
Report->addRange(Call.getArgSourceRange(2));
C.emitReport(std::move(Report));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/NSAutoreleasePoolChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NSAutoreleasePoolChecker.cpp
index bb01a3b77617..0648084a7d39 100644
--- a/clang/lib/StaticAnalyzer/Checkers/NSAutoreleasePoolChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/NSAutoreleasePoolChecker.cpp
@@ -31,7 +31,8 @@ using namespace ento;
namespace {
class NSAutoreleasePoolChecker
: public Checker<check::PreObjCMessage> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Use -drain instead of -release",
+ "API Upgrade (Apple)"};
mutable Selector releaseS;
public:
@@ -57,10 +58,6 @@ void NSAutoreleasePoolChecker::checkPreObjCMessage(const ObjCMethodCall &msg,
if (msg.getSelector() != releaseS)
return;
- if (!BT)
- BT.reset(new BugType(this, "Use -drain instead of -release",
- "API Upgrade (Apple)"));
-
ExplodedNode *N = C.generateNonFatalErrorNode();
if (!N) {
assert(0);
@@ -68,7 +65,7 @@ void NSAutoreleasePoolChecker::checkPreObjCMessage(const ObjCMethodCall &msg,
}
auto Report = std::make_unique<PathSensitiveBugReport>(
- *BT,
+ BT,
"Use -drain instead of -release when using NSAutoreleasePool and "
"garbage collection",
N);
diff --git a/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp
index 44b69ef31911..a9002ee7c966 100644
--- a/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp
@@ -18,6 +18,7 @@
#include "clang/Analysis/AnyCall.h"
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
@@ -31,8 +32,9 @@ namespace {
class NonNullParamChecker
: public Checker<check::PreCall, check::BeginFunction,
EventDispatcher<ImplicitNullDerefEvent>> {
- mutable std::unique_ptr<BugType> BTAttrNonNull;
- mutable std::unique_ptr<BugType> BTNullRefArg;
+ const BugType BTAttrNonNull{
+ this, "Argument with 'nonnull' attribute passed null", "API"};
+ const BugType BTNullRefArg{this, "Dereference of null pointer"};
public:
void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
@@ -278,13 +280,6 @@ std::unique_ptr<PathSensitiveBugReport>
NonNullParamChecker::genReportNullAttrNonNull(const ExplodedNode *ErrorNode,
const Expr *ArgE,
unsigned IdxOfArg) const {
- // Lazily allocate the BugType object if it hasn't already been
- // created. Ownership is transferred to the BugReporter object once
- // the BugReport is passed to 'EmitWarning'.
- if (!BTAttrNonNull)
- BTAttrNonNull.reset(new BugType(
- this, "Argument with 'nonnull' attribute passed null", "API"));
-
llvm::SmallString<256> SBuf;
llvm::raw_svector_ostream OS(SBuf);
OS << "Null pointer passed to "
@@ -292,7 +287,7 @@ NonNullParamChecker::genReportNullAttrNonNull(const ExplodedNode *ErrorNode,
<< " parameter expecting 'nonnull'";
auto R =
- std::make_unique<PathSensitiveBugReport>(*BTAttrNonNull, SBuf, ErrorNode);
+ std::make_unique<PathSensitiveBugReport>(BTAttrNonNull, SBuf, ErrorNode);
if (ArgE)
bugreporter::trackExpressionValue(ErrorNode, ArgE, *R);
@@ -302,11 +297,8 @@ NonNullParamChecker::genReportNullAttrNonNull(const ExplodedNode *ErrorNode,
std::unique_ptr<PathSensitiveBugReport>
NonNullParamChecker::genReportReferenceToNullPointer(
const ExplodedNode *ErrorNode, const Expr *ArgE) const {
- if (!BTNullRefArg)
- BTNullRefArg.reset(new BugType(this, "Dereference of null pointer"));
-
auto R = std::make_unique<PathSensitiveBugReport>(
- *BTNullRefArg, "Forming reference to null pointer", ErrorNode);
+ BTNullRefArg, "Forming reference to null pointer", ErrorNode);
if (ArgE) {
const Expr *ArgEDeref = bugreporter::getDerefExpr(ArgE);
if (!ArgEDeref)
@@ -314,7 +306,6 @@ NonNullParamChecker::genReportReferenceToNullPointer(
bugreporter::trackExpressionValue(ErrorNode, ArgEDeref, *R);
}
return R;
-
}
void ento::registerNonNullParamChecker(CheckerManager &mgr) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp
index 7906b787cd53..552c222a251a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCAtSyncChecker.cpp
@@ -25,8 +25,10 @@ using namespace ento;
namespace {
class ObjCAtSyncChecker
: public Checker< check::PreStmt<ObjCAtSynchronizedStmt> > {
- mutable std::unique_ptr<BugType> BT_null;
- mutable std::unique_ptr<BugType> BT_undef;
+ const BugType BT_null{this, "Nil value used as mutex for @synchronized() "
+ "(no synchronization will occur)"};
+ const BugType BT_undef{this, "Uninitialized value used as mutex "
+ "for @synchronized"};
public:
void checkPreStmt(const ObjCAtSynchronizedStmt *S, CheckerContext &C) const;
@@ -43,11 +45,8 @@ void ObjCAtSyncChecker::checkPreStmt(const ObjCAtSynchronizedStmt *S,
// Uninitialized value used for the mutex?
if (isa<UndefinedVal>(V)) {
if (ExplodedNode *N = C.generateErrorNode()) {
- if (!BT_undef)
- BT_undef.reset(new BugType(this, "Uninitialized value used as mutex "
- "for @synchronized"));
auto report = std::make_unique<PathSensitiveBugReport>(
- *BT_undef, BT_undef->getDescription(), N);
+ BT_undef, BT_undef.getDescription(), N);
bugreporter::trackExpressionValue(N, Ex, *report);
C.emitReport(std::move(report));
}
@@ -66,12 +65,8 @@ void ObjCAtSyncChecker::checkPreStmt(const ObjCAtSynchronizedStmt *S,
// Generate an error node. This isn't a sink since
// a null mutex just means no synchronization occurs.
if (ExplodedNode *N = C.generateNonFatalErrorNode(nullState)) {
- if (!BT_null)
- BT_null.reset(
- new BugType(this, "Nil value used as mutex for @synchronized() "
- "(no synchronization will occur)"));
auto report = std::make_unique<PathSensitiveBugReport>(
- *BT_null, BT_null->getDescription(), N);
+ BT_null, BT_null.getDescription(), N);
bugreporter::trackExpressionValue(N, Ex, *report);
C.emitReport(std::move(report));
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
index 0244a7a3ebff..28e88245ca95 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCContainersChecker.cpp
@@ -30,12 +30,7 @@ namespace {
class ObjCContainersChecker : public Checker< check::PreStmt<CallExpr>,
check::PostStmt<CallExpr>,
check::PointerEscape> {
- mutable std::unique_ptr<BugType> BT;
- inline void initBugType() const {
- if (!BT)
- BT.reset(new BugType(this, "CFArray API",
- categories::CoreFoundationObjectiveC));
- }
+ const BugType BT{this, "CFArray API", categories::CoreFoundationObjectiveC};
inline SymbolRef getArraySym(const Expr *E, CheckerContext &C) const {
SVal ArrayRef = C.getSVal(E);
@@ -140,9 +135,9 @@ void ObjCContainersChecker::checkPreStmt(const CallExpr *CE,
ExplodedNode *N = C.generateErrorNode(StOutBound);
if (!N)
return;
- initBugType();
+
auto R = std::make_unique<PathSensitiveBugReport>(
- *BT, "Index is out of bounds", N);
+ BT, "Index is out of bounds", N);
R->addRange(IdxExpr->getSourceRange());
bugreporter::trackExpressionValue(N, IdxExpr, *R,
{bugreporter::TrackingKind::Thorough,
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
index d88d6a94a30f..217c46451f80 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCSelfInitChecker.cpp
@@ -61,13 +61,13 @@ class ObjCSelfInitChecker : public Checker< check::PostObjCMessage,
check::PostCall,
check::Location,
check::Bind > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Missing \"self = [(super or self) init...]\"",
+ categories::CoreFoundationObjectiveC};
void checkForInvalidSelf(const Expr *E, CheckerContext &C,
const char *errorStr) const;
public:
- ObjCSelfInitChecker() {}
void checkPostObjCMessage(const ObjCMethodCall &Msg, CheckerContext &C) const;
void checkPostStmt(const ObjCIvarRefExpr *E, CheckerContext &C) const;
void checkPreStmt(const ReturnStmt *S, CheckerContext &C) const;
@@ -157,10 +157,7 @@ void ObjCSelfInitChecker::checkForInvalidSelf(const Expr *E, CheckerContext &C,
if (!N)
return;
- if (!BT)
- BT.reset(new BugType(this, "Missing \"self = [(super or self) init...]\"",
- categories::CoreFoundationObjectiveC));
- C.emitReport(std::make_unique<PathSensitiveBugReport>(*BT, errorStr, N));
+ C.emitReport(std::make_unique<PathSensitiveBugReport>(BT, errorStr, N));
}
void ObjCSelfInitChecker::checkPostObjCMessage(const ObjCMethodCall &Msg,
diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp
index 3547b7bb61a2..eb40711812e1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp
@@ -26,18 +26,19 @@ namespace {
class ObjCSuperDeallocChecker
: public Checker<check::PostObjCMessage, check::PreObjCMessage,
check::PreCall, check::Location> {
-
- mutable IdentifierInfo *IIdealloc, *IINSObject;
+ mutable IdentifierInfo *IIdealloc = nullptr;
+ mutable IdentifierInfo *IINSObject = nullptr;
mutable Selector SELdealloc;
- std::unique_ptr<BugType> DoubleSuperDeallocBugType;
+ const BugType DoubleSuperDeallocBugType{
+ this, "[super dealloc] should not be called more than once",
+ categories::CoreFoundationObjectiveC};
void initIdentifierInfoAndSelectors(ASTContext &Ctx) const;
bool isSuperDeallocMessage(const ObjCMethodCall &M) const;
public:
- ObjCSuperDeallocChecker();
void checkPostObjCMessage(const ObjCMethodCall &M, CheckerContext &C) const;
void checkPreObjCMessage(const ObjCMethodCall &M, CheckerContext &C) const;
@@ -188,7 +189,7 @@ void ObjCSuperDeallocChecker::reportUseAfterDealloc(SymbolRef Sym,
Desc = "Use of 'self' after it has been deallocated";
// Generate the report.
- auto BR = std::make_unique<PathSensitiveBugReport>(*DoubleSuperDeallocBugType,
+ auto BR = std::make_unique<PathSensitiveBugReport>(DoubleSuperDeallocBugType,
Desc, ErrNode);
BR->addRange(S->getSourceRange());
BR->addVisitor(std::make_unique<SuperDeallocBRVisitor>(Sym));
@@ -213,14 +214,6 @@ void ObjCSuperDeallocChecker::diagnoseCallArguments(const CallEvent &CE,
}
}
-ObjCSuperDeallocChecker::ObjCSuperDeallocChecker()
- : IIdealloc(nullptr), IINSObject(nullptr) {
-
- DoubleSuperDeallocBugType.reset(
- new BugType(this, "[super dealloc] should not be called more than once",
- categories::CoreFoundationObjectiveC));
-}
-
void
ObjCSuperDeallocChecker::initIdentifierInfoAndSelectors(ASTContext &Ctx) const {
if (IIdealloc)
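
The ObjCSuperDeallocChecker hunks above also drop a constructor whose only job was to null out cached IdentifierInfo pointers, replacing it with in-class "= nullptr" initializers. A tiny sketch of that cleanup, with placeholder types rather than the real clang classes:

// Identifier is an opaque stand-in for IdentifierInfo; only pointers to it
// are used, so no definition is needed.
struct Identifier;

// Before: a hand-written constructor exists solely to zero the caches.
class OldChecker {
  mutable Identifier *First, *Second;
public:
  OldChecker() : First(nullptr), Second(nullptr) {}
};

// After: the defaults sit next to the declarations and the implicitly
// generated default constructor does the right thing.
class NewChecker {
  mutable Identifier *First = nullptr;
  mutable Identifier *Second = nullptr;
};

int main() {
  OldChecker O;
  NewChecker N;
  (void)O;
  (void)N;
  return 0;
}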
diff --git a/clang/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp
index bd6e1ec3a8fc..eee9449f3180 100644
--- a/clang/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PaddingChecker.cpp
@@ -32,7 +32,7 @@ using namespace ento;
namespace {
class PaddingChecker : public Checker<check::ASTDecl<TranslationUnitDecl>> {
private:
- mutable std::unique_ptr<BugType> PaddingBug;
+ const BugType PaddingBug{this, "Excessive Padding", "Performance"};
mutable BugReporter *BR;
public:
@@ -310,10 +310,6 @@ public:
void reportRecord(
const RecordDecl *RD, CharUnits BaselinePad, CharUnits OptimalPad,
const SmallVector<const FieldDecl *, 20> &OptimalFieldsOrder) const {
- if (!PaddingBug)
- PaddingBug =
- std::make_unique<BugType>(this, "Excessive Padding", "Performance");
-
SmallString<100> Buf;
llvm::raw_svector_ostream Os(Buf);
Os << "Excessive padding in '";
@@ -341,8 +337,7 @@ public:
PathDiagnosticLocation CELoc =
PathDiagnosticLocation::create(RD, BR->getSourceManager());
- auto Report =
- std::make_unique<BasicBugReport>(*PaddingBug, Os.str(), CELoc);
+ auto Report = std::make_unique<BasicBugReport>(PaddingBug, Os.str(), CELoc);
Report->setDeclWithIssue(RD);
Report->addRange(RD->getSourceRange());
BR->emitReport(std::move(Report));
diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
index 1d63c0dd01f3..1141f07428b4 100644
--- a/clang/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PointerArithChecker.cpp
@@ -56,8 +56,8 @@ class PointerArithChecker
bool PointedNeeded = false) const;
void initAllocIdentifiers(ASTContext &C) const;
- mutable std::unique_ptr<BugType> BT_pointerArith;
- mutable std::unique_ptr<BugType> BT_polyArray;
+ const BugType BT_pointerArith{this, "Dangerous pointer arithmetic"};
+ const BugType BT_polyArray{this, "Dangerous pointer arithmetic"};
mutable llvm::SmallSet<IdentifierInfo *, 8> AllocFunctions;
public:
@@ -168,12 +168,10 @@ void PointerArithChecker::reportPointerArithMisuse(const Expr *E,
if (!IsPolymorphic)
return;
if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
- if (!BT_polyArray)
- BT_polyArray.reset(new BugType(this, "Dangerous pointer arithmetic"));
constexpr llvm::StringLiteral Msg =
"Pointer arithmetic on a pointer to base class is dangerous "
"because derived and base class may have different size.";
- auto R = std::make_unique<PathSensitiveBugReport>(*BT_polyArray, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT_polyArray, Msg, N);
R->addRange(E->getSourceRange());
R->markInteresting(ArrayRegion);
C.emitReport(std::move(R));
@@ -190,12 +188,10 @@ void PointerArithChecker::reportPointerArithMisuse(const Expr *E,
return;
if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
- if (!BT_pointerArith)
- BT_pointerArith.reset(new BugType(this, "Dangerous pointer arithmetic"));
constexpr llvm::StringLiteral Msg =
"Pointer arithmetic on non-array variables relies on memory layout, "
"which is dangerous.";
- auto R = std::make_unique<PathSensitiveBugReport>(*BT_pointerArith, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT_pointerArith, Msg, N);
R->addRange(SR);
R->markInteresting(Region);
C.emitReport(std::move(R));
diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
index 96d38eef3c03..2438cf30b39b 100644
--- a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp
@@ -25,7 +25,7 @@ using namespace ento;
namespace {
class PointerSubChecker
: public Checker< check::PreStmt<BinaryOperator> > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Pointer subtraction"};
public:
void checkPreStmt(const BinaryOperator *B, CheckerContext &C) const;
@@ -59,12 +59,10 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B,
return;
if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
- if (!BT)
- BT.reset(new BugType(this, "Pointer subtraction"));
constexpr llvm::StringLiteral Msg =
"Subtraction of two pointers that do not point to the same memory "
"chunk may cause incorrect result.";
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
R->addRange(B->getSourceRange());
C.emitReport(std::move(R));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp
index 11dca1ff8831..09d82ebabd4c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ReturnPointerRangeChecker.cpp
@@ -26,7 +26,9 @@ using namespace ento;
namespace {
class ReturnPointerRangeChecker :
public Checker< check::PreStmt<ReturnStmt> > {
- mutable std::unique_ptr<BugType> BT;
+ // FIXME: This bug corresponds to CWE-466. Eventually we should have bug
+ // types explicitly reference such exploit categories (when applicable).
+ const BugType BT{this, "Buffer overflow"};
public:
void checkPreStmt(const ReturnStmt *RS, CheckerContext &C) const;
@@ -76,16 +78,12 @@ void ReturnPointerRangeChecker::checkPreStmt(const ReturnStmt *RS,
if (!N)
return;
- // FIXME: This bug correspond to CWE-466. Eventually we should have bug
- // types explicitly reference such exploit categories (when applicable).
- if (!BT)
- BT.reset(new BugType(this, "Buffer overflow"));
constexpr llvm::StringLiteral Msg =
"Returned pointer value points outside the original object "
"(potential buffer overflow)";
// Generate a report for this bug.
- auto Report = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+ auto Report = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
Report->addRange(RetE->getSourceRange());
const auto ConcreteElementCount = ElementCount.getAs<nonloc::ConcreteInt>();
diff --git a/clang/lib/StaticAnalyzer/Checkers/ReturnUndefChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ReturnUndefChecker.cpp
index 78cd0100bea4..efffbf2ee755 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ReturnUndefChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ReturnUndefChecker.cpp
@@ -24,8 +24,8 @@ using namespace ento;
namespace {
class ReturnUndefChecker : public Checker< check::PreStmt<ReturnStmt> > {
- mutable std::unique_ptr<BugType> BT_Undef;
- mutable std::unique_ptr<BugType> BT_NullReference;
+ const BugType BT_Undef{this, "Garbage return value"};
+ const BugType BT_NullReference{this, "Returning null reference"};
void emitUndef(CheckerContext &C, const Expr *RetE) const;
void checkReference(CheckerContext &C, const Expr *RetE,
@@ -77,7 +77,7 @@ void ReturnUndefChecker::checkPreStmt(const ReturnStmt *RS,
}
}
-static void emitBug(CheckerContext &C, BugType &BT, StringRef Msg,
+static void emitBug(CheckerContext &C, const BugType &BT, StringRef Msg,
const Expr *RetE, const Expr *TrackingE = nullptr) {
ExplodedNode *N = C.generateErrorNode();
if (!N)
@@ -92,9 +92,7 @@ static void emitBug(CheckerContext &C, BugType &BT, StringRef Msg,
}
void ReturnUndefChecker::emitUndef(CheckerContext &C, const Expr *RetE) const {
- if (!BT_Undef)
- BT_Undef.reset(new BugType(this, "Garbage return value"));
- emitBug(C, *BT_Undef, "Undefined or garbage value returned to caller", RetE);
+ emitBug(C, BT_Undef, "Undefined or garbage value returned to caller", RetE);
}
void ReturnUndefChecker::checkReference(CheckerContext &C, const Expr *RetE,
@@ -109,10 +107,7 @@ void ReturnUndefChecker::checkReference(CheckerContext &C, const Expr *RetE,
}
// The return value is known to be null. Emit a bug report.
- if (!BT_NullReference)
- BT_NullReference.reset(new BugType(this, "Returning null reference"));
-
- emitBug(C, *BT_NullReference, BT_NullReference->getDescription(), RetE,
+ emitBug(C, BT_NullReference, BT_NullReference.getDescription(), RetE,
bugreporter::getDerefExpr(RetE));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp
index 2ac9f65c9793..7cbe271dfbf9 100644
--- a/clang/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/SimpleStreamChecker.cpp
@@ -52,10 +52,13 @@ class SimpleStreamChecker : public Checker<check::PostCall,
check::PreCall,
check::DeadSymbols,
check::PointerEscape> {
- CallDescription OpenFn, CloseFn;
+ const CallDescription OpenFn{{"fopen"}, 2};
+ const CallDescription CloseFn{{"fclose"}, 1};
- std::unique_ptr<BugType> DoubleCloseBugType;
- std::unique_ptr<BugType> LeakBugType;
+ const BugType DoubleCloseBugType{this, "Double fclose",
+ "Unix Stream API Error"};
+ const BugType LeakBugType{this, "Resource Leak", "Unix Stream API Error",
+ /*SuppressOnSink=*/true};
void reportDoubleClose(SymbolRef FileDescSym,
const CallEvent &Call,
@@ -67,8 +70,6 @@ class SimpleStreamChecker : public Checker<check::PostCall,
bool guaranteedNotToCloseFile(const CallEvent &Call) const;
public:
- SimpleStreamChecker();
-
/// Process fopen.
void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
/// Process fclose.
@@ -89,18 +90,6 @@ public:
/// state. Let's store it in the ProgramState.
REGISTER_MAP_WITH_PROGRAMSTATE(StreamMap, SymbolRef, StreamState)
-SimpleStreamChecker::SimpleStreamChecker()
- : OpenFn({"fopen"}, 2), CloseFn({"fclose"}, 1) {
- // Initialize the bug types.
- DoubleCloseBugType.reset(
- new BugType(this, "Double fclose", "Unix Stream API Error"));
-
- // Sinks are higher importance bugs as well as calls to assert() or exit(0).
- LeakBugType.reset(
- new BugType(this, "Resource Leak", "Unix Stream API Error",
- /*SuppressOnSink=*/true));
-}
-
void SimpleStreamChecker::checkPostCall(const CallEvent &Call,
CheckerContext &C) const {
if (!Call.isGlobalCFunction())
@@ -192,7 +181,7 @@ void SimpleStreamChecker::reportDoubleClose(SymbolRef FileDescSym,
// Generate the report.
auto R = std::make_unique<PathSensitiveBugReport>(
- *DoubleCloseBugType, "Closing a previously closed file stream", ErrNode);
+ DoubleCloseBugType, "Closing a previously closed file stream", ErrNode);
R->addRange(Call.getSourceRange());
R->markInteresting(FileDescSym);
C.emitReport(std::move(R));
@@ -205,7 +194,7 @@ void SimpleStreamChecker::reportLeaks(ArrayRef<SymbolRef> LeakedStreams,
// TODO: Identify the leaked file descriptor.
for (SymbolRef LeakedStream : LeakedStreams) {
auto R = std::make_unique<PathSensitiveBugReport>(
- *LeakBugType, "Opened file is never closed; potential resource leak",
+ LeakBugType, "Opened file is never closed; potential resource leak",
ErrNode);
R->markInteresting(LeakedStream);
C.emitReport(std::move(R));
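SimpleStreamChecker additionally gives its CallDescription members in-class initializers, which is what lets its out-of-line constructor go away. A short stand-alone sketch of how such a member is consulted; isFopenCall is an illustrative helper (not code from this patch), while CallDescription::matches() is the existing matching API:

#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"

using namespace clang;
using namespace ento;

// Mirrors the member initializers added above: a qualified name plus the
// expected argument count.
static const CallDescription FOpenFn{{"fopen"}, /*RequiredArgs=*/2};

static bool isFopenCall(const CallEvent &Call) {
  // matches() compares the callee's name and arity against the description,
  // so no separate IdentifierInfo caching is needed.
  return FOpenFn.matches(Call);
}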
diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
index fffcaf7ed18f..6560fd239ce6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp
@@ -823,7 +823,7 @@ class StdLibraryFunctionsChecker
using FunctionSummaryMapType = llvm::DenseMap<const FunctionDecl *, Summary>;
mutable FunctionSummaryMapType FunctionSummaryMap;
- mutable std::unique_ptr<BugType> BT_InvalidArg;
+ const BugType BT_InvalidArg{this, "Function call with invalid argument"};
mutable bool SummariesInitialized = false;
static SVal getArgSVal(const CallEvent &Call, ArgNo ArgN) {
@@ -875,11 +875,7 @@ private:
VC->describe(ValueConstraint::Violation, Call, C.getState(), Summary,
MsgOs);
Msg[0] = toupper(Msg[0]);
- if (!BT_InvalidArg)
- BT_InvalidArg = std::make_unique<BugType>(
- CheckName, "Function call with invalid argument",
- categories::LogicError);
- auto R = std::make_unique<PathSensitiveBugReport>(*BT_InvalidArg, Msg, N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT_InvalidArg, Msg, N);
for (ArgNo ArgN : VC->getArgsToTrack()) {
bugreporter::trackExpressionValue(N, Call.getArgExpr(ArgN), *R);
@@ -2244,6 +2240,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries(
.ArgConstraint(NotNull(ArgNo(0)))
.ArgConstraint(NotNull(ArgNo(1))));
+ // int fflush(FILE *stream);
+ addToFunctionSummaryMap(
+ "fflush", Signature(ArgTypes{FilePtrTy}, RetType{IntTy}),
+ Summary(NoEvalCall)
+ .Case(ReturnsZero, ErrnoMustNotBeChecked, GenericSuccessMsg)
+ .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))},
+ ErrnoNEZeroIrrelevant, GenericFailureMsg));
+
// long ftell(FILE *stream);
// From 'The Open Group Base Specifications Issue 7, 2018 edition':
// "The ftell() function shall not change the setting of errno if
diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 925fc90e3554..254b36ed0396 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -266,6 +266,8 @@ private:
{&StreamChecker::preFseek, &StreamChecker::evalFseek, 0}},
{{{"ftell"}, 1},
{&StreamChecker::preDefault, &StreamChecker::evalFtell, 0}},
+ {{{"fflush"}, 1},
+ {&StreamChecker::preFflush, &StreamChecker::evalFflush, 0}},
{{{"rewind"}, 1},
{&StreamChecker::preDefault, &StreamChecker::evalRewind, 0}},
{{{"fgetpos"}, 2},
@@ -360,6 +362,12 @@ private:
CheckerContext &C,
const StreamErrorState &ErrorKind) const;
+ void preFflush(const FnDescription *Desc, const CallEvent &Call,
+ CheckerContext &C) const;
+
+ void evalFflush(const FnDescription *Desc, const CallEvent &Call,
+ CheckerContext &C) const;
+
/// Check that the stream (in StreamVal) is not NULL.
/// If it can only be NULL a fatal error is emitted and nullptr returned.
/// Otherwise the return value is a new state where the stream is constrained
@@ -1191,6 +1199,84 @@ void StreamChecker::evalSetFeofFerror(const FnDescription *Desc,
C.addTransition(State);
}
+void StreamChecker::preFflush(const FnDescription *Desc, const CallEvent &Call,
+ CheckerContext &C) const {
+ ProgramStateRef State = C.getState();
+ SVal StreamVal = getStreamArg(Desc, Call);
+ std::optional<DefinedSVal> Stream = StreamVal.getAs<DefinedSVal>();
+ if (!Stream)
+ return;
+
+ ProgramStateRef StateNotNull, StateNull;
+ std::tie(StateNotNull, StateNull) =
+ C.getConstraintManager().assumeDual(State, *Stream);
+ if (StateNotNull && !StateNull)
+ ensureStreamOpened(StreamVal, C, StateNotNull);
+}
+
+void StreamChecker::evalFflush(const FnDescription *Desc, const CallEvent &Call,
+ CheckerContext &C) const {
+ ProgramStateRef State = C.getState();
+ SVal StreamVal = getStreamArg(Desc, Call);
+ std::optional<DefinedSVal> Stream = StreamVal.getAs<DefinedSVal>();
+ if (!Stream)
+ return;
+
+ // Skip if the stream can be both NULL and non-NULL.
+ ProgramStateRef StateNotNull, StateNull;
+ std::tie(StateNotNull, StateNull) =
+ C.getConstraintManager().assumeDual(State, *Stream);
+ if (StateNotNull && StateNull)
+ return;
+ if (StateNotNull && !StateNull)
+ State = StateNotNull;
+ else
+ State = StateNull;
+
+ const CallExpr *CE = dyn_cast_or_null<CallExpr>(Call.getOriginExpr());
+ if (!CE)
+ return;
+
+ // `fflush` returns EOF on failure, otherwise returns 0.
+ ProgramStateRef StateFailed = bindInt(*EofVal, State, C, CE);
+ ProgramStateRef StateNotFailed = bindInt(0, State, C, CE);
+
+ // Clear a stream's error state if `fflush` returns 0, but retain its EOF flag.
+ auto ClearErrorInNotFailed = [&StateNotFailed, Desc](SymbolRef Sym,
+ const StreamState *SS) {
+ if (SS->ErrorState & ErrorFError) {
+ StreamErrorState NewES =
+ (SS->ErrorState & ErrorFEof) ? ErrorFEof : ErrorNone;
+ StreamState NewSS = StreamState::getOpened(Desc, NewES, false);
+ StateNotFailed = StateNotFailed->set<StreamMap>(Sym, NewSS);
+ }
+ };
+
+ if (StateNotNull && !StateNull) {
+ // Skip if the input stream's state is unknown, open-failed or closed.
+ if (SymbolRef StreamSym = StreamVal.getAsSymbol()) {
+ const StreamState *SS = State->get<StreamMap>(StreamSym);
+ if (SS) {
+ assert(SS->isOpened() && "Stream is expected to be opened");
+ ClearErrorInNotFailed(StreamSym, SS);
+ } else
+ return;
+ }
+ } else {
+ // Clear error states for all streams.
+ const StreamMapTy &Map = StateNotFailed->get<StreamMap>();
+ for (const auto &I : Map) {
+ SymbolRef Sym = I.first;
+ const StreamState &SS = I.second;
+ if (SS.isOpened())
+ ClearErrorInNotFailed(Sym, &SS);
+ }
+ }
+
+ C.addTransition(StateNotFailed);
+ C.addTransition(StateFailed);
+}
+
ProgramStateRef
StreamChecker::ensureStreamNonNull(SVal StreamVal, const Expr *StreamE,
CheckerContext &C,
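The new preFflush/evalFflush callbacks above, together with the fflush summary added to StdLibraryFunctionsChecker, model the standard library contract for fflush: it returns 0 on success and EOF on failure (setting errno), and fflush(NULL) flushes every open output stream. A plain usage sketch of the behaviour being modelled, not part of the patch:

#include <cerrno>
#include <cstdio>

void flush_demo(std::FILE *F) {
  if (std::fflush(F) == EOF) {
    // On failure fflush sets errno; the checker models it as a nonzero value.
    std::perror("fflush");
    return;
  }
  // Success: the checker clears the stream's error flag but keeps its EOF flag.
  std::fflush(nullptr); // flush all open output streams at once
}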
diff --git a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h
index 557e8a76506e..6de33da107a3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h
+++ b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h
@@ -52,7 +52,7 @@ removeInformationStoredForDeadInstances(const CallEvent &Call,
template <class TypeMap>
void handleConstructorAndAssignment(const CallEvent &Call, CheckerContext &C,
- const SVal &ThisSVal) {
+ SVal ThisSVal) {
ProgramStateRef State = Call.getState();
if (!State)
diff --git a/clang/lib/StaticAnalyzer/Checkers/TaintTesterChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/TaintTesterChecker.cpp
index 614a2b2e4ec7..acf4e833095b 100644
--- a/clang/lib/StaticAnalyzer/Checkers/TaintTesterChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/TaintTesterChecker.cpp
@@ -23,8 +23,7 @@ using namespace taint;
namespace {
class TaintTesterChecker : public Checker<check::PostStmt<Expr>> {
- std::unique_ptr<BugType> BT =
- std::make_unique<BugType>(this, "Tainted data", "General");
+ const BugType BT{this, "Tainted data", "General"};
public:
void checkPostStmt(const Expr *E, CheckerContext &C) const;
@@ -39,7 +38,7 @@ void TaintTesterChecker::checkPostStmt(const Expr *E,
if (isTainted(State, E, C.getLocationContext())) {
if (ExplodedNode *N = C.generateNonFatalErrorNode()) {
- auto report = std::make_unique<PathSensitiveBugReport>(*BT, "tainted", N);
+ auto report = std::make_unique<PathSensitiveBugReport>(BT, "tainted", N);
report->addRange(E->getSourceRange());
C.emitReport(std::move(report));
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp
index 5cdcc1075f44..667b19f8120e 100644
--- a/clang/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/TestAfterDivZeroChecker.cpp
@@ -78,7 +78,7 @@ public:
class TestAfterDivZeroChecker
: public Checker<check::PreStmt<BinaryOperator>, check::BranchCondition,
check::EndFunction> {
- mutable std::unique_ptr<BugType> DivZeroBug;
+ const BugType DivZeroBug{this, "Division by zero"};
void reportBug(SVal Val, CheckerContext &C) const;
public:
@@ -165,12 +165,10 @@ bool TestAfterDivZeroChecker::hasDivZeroMap(SVal Var,
void TestAfterDivZeroChecker::reportBug(SVal Val, CheckerContext &C) const {
if (ExplodedNode *N = C.generateErrorNode(C.getState())) {
- if (!DivZeroBug)
- DivZeroBug.reset(new BugType(this, "Division by zero"));
-
auto R = std::make_unique<PathSensitiveBugReport>(
- *DivZeroBug, "Value being compared against zero has already been used "
- "for division",
+ DivZeroBug,
+ "Value being compared against zero has already been used "
+ "for division",
N);
R->addVisitor(std::make_unique<DivisionBRVisitor>(Val.getAsSymbol(),
diff --git a/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
index db886501a162..aa478b69aade 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UndefBranchChecker.cpp
@@ -27,7 +27,7 @@ using namespace ento;
namespace {
class UndefBranchChecker : public Checker<check::BranchCondition> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Branch condition evaluates to a garbage value"};
struct FindUndefExpr {
ProgramStateRef St;
@@ -64,52 +64,47 @@ void UndefBranchChecker::checkBranchCondition(const Stmt *Condition,
// ObjCForCollection is a loop, but has no actual condition.
if (isa<ObjCForCollectionStmt>(Condition))
return;
- SVal X = Ctx.getSVal(Condition);
- if (X.isUndef()) {
- // Generate a sink node, which implicitly marks both outgoing branches as
- // infeasible.
- ExplodedNode *N = Ctx.generateErrorNode();
- if (N) {
- if (!BT)
- BT.reset(
- new BugType(this, "Branch condition evaluates to a garbage value"));
-
- // What's going on here: we want to highlight the subexpression of the
- // condition that is the most likely source of the "uninitialized
- // branch condition." We do a recursive walk of the condition's
- // subexpressions and roughly look for the most nested subexpression
- // that binds to Undefined. We then highlight that expression's range.
-
- // Get the predecessor node and check if is a PostStmt with the Stmt
- // being the terminator condition. We want to inspect the state
- // of that node instead because it will contain main information about
- // the subexpressions.
-
- // Note: any predecessor will do. They should have identical state,
- // since all the BlockEdge did was act as an error sink since the value
- // had to already be undefined.
- assert (!N->pred_empty());
- const Expr *Ex = cast<Expr>(Condition);
- ExplodedNode *PrevN = *N->pred_begin();
- ProgramPoint P = PrevN->getLocation();
- ProgramStateRef St = N->getState();
-
- if (std::optional<PostStmt> PS = P.getAs<PostStmt>())
- if (PS->getStmt() == Ex)
- St = PrevN->getState();
-
- FindUndefExpr FindIt(St, Ctx.getLocationContext());
- Ex = FindIt.FindExpr(Ex);
-
- // Emit the bug report.
- auto R = std::make_unique<PathSensitiveBugReport>(
- *BT, BT->getDescription(), N);
- bugreporter::trackExpressionValue(N, Ex, *R);
- R->addRange(Ex->getSourceRange());
-
- Ctx.emitReport(std::move(R));
- }
- }
+ if (!Ctx.getSVal(Condition).isUndef())
+ return;
+
+ // Generate a sink node, which implicitly marks both outgoing branches as
+ // infeasible.
+ ExplodedNode *N = Ctx.generateErrorNode();
+ if (!N)
+ return;
+ // What's going on here: we want to highlight the subexpression of the
+ // condition that is the most likely source of the "uninitialized
+ // branch condition." We do a recursive walk of the condition's
+ // subexpressions and roughly look for the most nested subexpression
+ // that binds to Undefined. We then highlight that expression's range.
+
+ // Get the predecessor node and check if it is a PostStmt with the Stmt
+ // being the terminator condition. We want to inspect the state
+ // of that node instead because it will contain the main information about
+ // the subexpressions.
+
+ // Note: any predecessor will do. They should have identical state,
+ // since all the BlockEdge did was act as an error sink since the value
+ // had to already be undefined.
+ assert(!N->pred_empty());
+ const Expr *Ex = cast<Expr>(Condition);
+ ExplodedNode *PrevN = *N->pred_begin();
+ ProgramPoint P = PrevN->getLocation();
+ ProgramStateRef St = N->getState();
+
+ if (std::optional<PostStmt> PS = P.getAs<PostStmt>())
+ if (PS->getStmt() == Ex)
+ St = PrevN->getState();
+
+ FindUndefExpr FindIt(St, Ctx.getLocationContext());
+ Ex = FindIt.FindExpr(Ex);
+
+ // Emit the bug report.
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, BT.getDescription(), N);
+ bugreporter::trackExpressionValue(N, Ex, *R);
+ R->addRange(Ex->getSourceRange());
+
+ Ctx.emitReport(std::move(R));
}
void ento::registerUndefBranchChecker(CheckerManager &mgr) {
diff --git a/clang/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp
index ecb6ed36ee40..2839ef0b6d2e 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UndefCapturedBlockVarChecker.cpp
@@ -27,7 +27,7 @@ using namespace ento;
namespace {
class UndefCapturedBlockVarChecker
: public Checker< check::PostStmt<BlockExpr> > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "uninitialized variable captured by block"};
public:
void checkPostStmt(const BlockExpr *BE, CheckerContext &C) const;
@@ -70,10 +70,6 @@ UndefCapturedBlockVarChecker::checkPostStmt(const BlockExpr *BE,
if (std::optional<UndefinedVal> V =
state->getSVal(Var.getOriginalRegion()).getAs<UndefinedVal>()) {
if (ExplodedNode *N = C.generateErrorNode()) {
- if (!BT)
- BT.reset(
- new BugType(this, "uninitialized variable captured by block"));
-
// Generate a bug report.
SmallString<128> buf;
llvm::raw_svector_ostream os(buf);
@@ -81,7 +77,7 @@ UndefCapturedBlockVarChecker::checkPostStmt(const BlockExpr *BE,
os << "Variable '" << VD->getName()
<< "' is uninitialized when captured by block";
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, os.str(), N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, os.str(), N);
if (const Expr *Ex = FindBlockDeclRefExpr(BE->getBody(), VD))
R->addRange(Ex->getSourceRange());
bugreporter::trackStoredValue(*V, VR, *R,
diff --git a/clang/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp
index d593a6bd74cf..4b845bb3ded2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UndefResultChecker.cpp
@@ -28,7 +28,7 @@ namespace {
class UndefResultChecker
: public Checker< check::PostStmt<BinaryOperator> > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Result of operation is garbage or undefined"};
public:
void checkPostStmt(const BinaryOperator *B, CheckerContext &C) const;
@@ -74,10 +74,6 @@ void UndefResultChecker::checkPostStmt(const BinaryOperator *B,
if (!N)
return;
- if (!BT)
- BT.reset(
- new BugType(this, "Result of operation is garbage or undefined"));
-
SmallString<256> sbuf;
llvm::raw_svector_ostream OS(sbuf);
const Expr *Ex = nullptr;
@@ -104,7 +100,7 @@ void UndefResultChecker::checkPostStmt(const BinaryOperator *B,
<< BinaryOperator::getOpcodeStr(B->getOpcode())
<< "' expression is undefined";
}
- auto report = std::make_unique<PathSensitiveBugReport>(*BT, OS.str(), N);
+ auto report = std::make_unique<PathSensitiveBugReport>(BT, OS.str(), N);
if (Ex) {
report->addRange(Ex->getSourceRange());
bugreporter::trackExpressionValue(N, Ex, *report);
diff --git a/clang/lib/StaticAnalyzer/Checkers/UndefinedArraySubscriptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UndefinedArraySubscriptChecker.cpp
index a6cc8cac8c99..baa07fa66764 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UndefinedArraySubscriptChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UndefinedArraySubscriptChecker.cpp
@@ -24,7 +24,7 @@ using namespace ento;
namespace {
class UndefinedArraySubscriptChecker
: public Checker< check::PreStmt<ArraySubscriptExpr> > {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Array subscript is undefined"};
public:
void checkPreStmt(const ArraySubscriptExpr *A, CheckerContext &C) const;
@@ -48,11 +48,8 @@ UndefinedArraySubscriptChecker::checkPreStmt(const ArraySubscriptExpr *A,
ExplodedNode *N = C.generateErrorNode();
if (!N)
return;
- if (!BT)
- BT.reset(new BugType(this, "Array subscript is undefined"));
-
// Generate a report for this bug.
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, BT->getDescription(), N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, BT.getDescription(), N);
R->addRange(A->getIdx()->getSourceRange());
bugreporter::trackExpressionValue(N, A->getIdx(), *R);
C.emitReport(std::move(R));
diff --git a/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
index 49ac94f65dd0..ddc6cc9e8202 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UndefinedAssignmentChecker.cpp
@@ -23,7 +23,7 @@ using namespace ento;
namespace {
class UndefinedAssignmentChecker
: public Checker<check::Bind> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Assigned value is garbage or undefined"};
public:
void checkBind(SVal location, SVal val, const Stmt *S,
@@ -49,11 +49,6 @@ void UndefinedAssignmentChecker::checkBind(SVal location, SVal val,
if (!N)
return;
- static const char *const DefaultMsg =
- "Assigned value is garbage or undefined";
- if (!BT)
- BT.reset(new BugType(this, DefaultMsg));
-
// Generate a report for this bug.
llvm::SmallString<128> Str;
llvm::raw_svector_ostream OS(Str);
@@ -105,9 +100,9 @@ void UndefinedAssignmentChecker::checkBind(SVal location, SVal val,
}
if (OS.str().empty())
- OS << DefaultMsg;
+ OS << BT.getDescription();
- auto R = std::make_unique<PathSensitiveBugReport>(*BT, OS.str(), N);
+ auto R = std::make_unique<PathSensitiveBugReport>(BT, OS.str(), N);
if (ex) {
R->addRange(ex->getSourceRange());
bugreporter::trackExpressionValue(N, ex, *R);
diff --git a/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObject.h b/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObject.h
index 2fcdd6086309..e35778e6480c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObject.h
+++ b/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObject.h
@@ -299,7 +299,7 @@ private:
bool isDereferencableUninit(const FieldRegion *FR, FieldChainInfo LocalChain);
/// Returns true if the value of a primitive object is uninitialized.
- bool isPrimitiveUninit(const SVal &V);
+ bool isPrimitiveUninit(SVal V);
// Note that we don't have a method for arrays -- the elements of an array are
// often left uninitialized intentionally even when it is of a C++ record
diff --git a/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObjectChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObjectChecker.cpp
index 3647c49cf3f9..6e1222fedad3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObjectChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UninitializedObject/UninitializedObjectChecker.cpp
@@ -38,15 +38,12 @@ namespace {
class UninitializedObjectChecker
: public Checker<check::EndFunction, check::DeadSymbols> {
- std::unique_ptr<BugType> BT_uninitField;
+ const BugType BT_uninitField{this, "Uninitialized fields"};
public:
// The fields of this struct will be initialized when registering the checker.
UninitObjCheckerOptions Opts;
- UninitializedObjectChecker()
- : BT_uninitField(new BugType(this, "Uninitialized fields")) {}
-
void checkEndFunction(const ReturnStmt *RS, CheckerContext &C) const;
void checkDeadSymbols(SymbolReaper &SR, CheckerContext &C) const;
};
@@ -186,7 +183,7 @@ void UninitializedObjectChecker::checkEndFunction(
for (const auto &Pair : UninitFields) {
auto Report = std::make_unique<PathSensitiveBugReport>(
- *BT_uninitField, Pair.second, Node, LocUsedForUniqueing,
+ BT_uninitField, Pair.second, Node, LocUsedForUniqueing,
Node->getLocationContext()->getDecl());
Context.emitReport(std::move(Report));
}
@@ -200,7 +197,7 @@ void UninitializedObjectChecker::checkEndFunction(
<< " at the end of the constructor call";
auto Report = std::make_unique<PathSensitiveBugReport>(
- *BT_uninitField, WarningOS.str(), Node, LocUsedForUniqueing,
+ BT_uninitField, WarningOS.str(), Node, LocUsedForUniqueing,
Node->getLocationContext()->getDecl());
for (const auto &Pair : UninitFields) {
@@ -379,7 +376,7 @@ bool FindUninitializedFields::isUnionUninit(const TypedValueRegion *R) {
return false;
}
-bool FindUninitializedFields::isPrimitiveUninit(const SVal &V) {
+bool FindUninitializedFields::isPrimitiveUninit(SVal V) {
if (V.isUndef())
return true;
diff --git a/clang/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp
index f503b3e88bb3..b05ce610067c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/UnixAPIChecker.cpp
@@ -11,9 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/Basic/TargetInfo.h"
+#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/CommonBugCategories.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
@@ -39,7 +40,9 @@ enum class OpenVariant {
namespace {
class UnixAPIMisuseChecker : public Checker< check::PreStmt<CallExpr> > {
- mutable std::unique_ptr<BugType> BT_open, BT_pthreadOnce;
+ const BugType BT_open{this, "Improper use of 'open'", categories::UnixAPI};
+ const BugType BT_pthreadOnce{this, "Improper use of 'pthread_once'",
+ categories::UnixAPI};
mutable std::optional<uint64_t> Val_O_CREAT;
public:
@@ -64,7 +67,9 @@ public:
void checkPreStmt(const CallExpr *CE, CheckerContext &C) const;
private:
- mutable std::unique_ptr<BugType> BT_mallocZero;
+ const BugType BT_mallocZero{
+ this, "Undefined allocation of 0 bytes (CERT MEM04-C; CWE-131)",
+ categories::UnixAPI};
void CheckCallocZero(CheckerContext &C, const CallExpr *CE) const;
void CheckMallocZero(CheckerContext &C, const CallExpr *CE) const;
@@ -87,14 +92,6 @@ private:
} //end anonymous namespace
-static void LazyInitialize(const CheckerBase *Checker,
- std::unique_ptr<BugType> &BT,
- const char *name) {
- if (BT)
- return;
- BT.reset(new BugType(Checker, name, categories::UnixAPI));
-}
-
//===----------------------------------------------------------------------===//
// "open" (man 2 open)
//===----------------------------------------------------------------------===/
@@ -132,9 +129,7 @@ void UnixAPIMisuseChecker::ReportOpenBug(CheckerContext &C,
if (!N)
return;
- LazyInitialize(this, BT_open, "Improper use of 'open'");
-
- auto Report = std::make_unique<PathSensitiveBugReport>(*BT_open, Msg, N);
+ auto Report = std::make_unique<PathSensitiveBugReport>(BT_open, Msg, N);
Report->addRange(SR);
C.emitReport(std::move(Report));
}
@@ -301,10 +296,8 @@ void UnixAPIMisuseChecker::CheckPthreadOnce(CheckerContext &C,
if (isa<VarRegion>(R) && isa<StackLocalsSpaceRegion>(R->getMemorySpace()))
os << " Perhaps you intended to declare the variable as 'static'?";
- LazyInitialize(this, BT_pthreadOnce, "Improper use of 'pthread_once'");
-
auto report =
- std::make_unique<PathSensitiveBugReport>(*BT_pthreadOnce, os.str(), N);
+ std::make_unique<PathSensitiveBugReport>(BT_pthreadOnce, os.str(), N);
report->addRange(CE->getArg(0)->getSourceRange());
C.emitReport(std::move(report));
}
@@ -341,14 +334,11 @@ bool UnixAPIPortabilityChecker::ReportZeroByteAllocation(
if (!N)
return false;
- LazyInitialize(this, BT_mallocZero,
- "Undefined allocation of 0 bytes (CERT MEM04-C; CWE-131)");
-
SmallString<256> S;
llvm::raw_svector_ostream os(S);
os << "Call to '" << fn_name << "' has an allocation size of 0 bytes";
auto report =
- std::make_unique<PathSensitiveBugReport>(*BT_mallocZero, os.str(), N);
+ std::make_unique<PathSensitiveBugReport>(BT_mallocZero, os.str(), N);
report->addRange(arg->getSourceRange());
bugreporter::trackExpressionValue(N, arg, *report);
diff --git a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
index 1d03d1656b3c..d76fe4991869 100644
--- a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp
@@ -34,8 +34,10 @@ namespace {
class VLASizeChecker
: public Checker<check::PreStmt<DeclStmt>,
check::PreStmt<UnaryExprOrTypeTraitExpr>> {
- mutable std::unique_ptr<BugType> BT;
- mutable std::unique_ptr<BugType> TaintBT;
+ const BugType BT{this, "Dangerous variable-length array (VLA) declaration"};
+ const BugType TaintBT{this,
+ "Dangerous variable-length array (VLA) declaration",
+ categories::TaintedData};
enum VLASize_Kind { VLA_Garbage, VLA_Zero, VLA_Negative, VLA_Overflow };
/// Check a VLA for validity.
@@ -213,17 +215,12 @@ void VLASizeChecker::reportTaintBug(const Expr *SizeE, ProgramStateRef State,
if (!N)
return;
- if (!TaintBT)
- TaintBT.reset(
- new BugType(this, "Dangerous variable-length array (VLA) declaration",
- categories::TaintedData));
-
SmallString<256> buf;
llvm::raw_svector_ostream os(buf);
os << "Declared variable-length array (VLA) ";
os << "has tainted size";
- auto report = std::make_unique<PathSensitiveBugReport>(*TaintBT, os.str(), N);
+ auto report = std::make_unique<PathSensitiveBugReport>(TaintBT, os.str(), N);
report->addRange(SizeE->getSourceRange());
bugreporter::trackExpressionValue(N, SizeE, *report);
// The vla size may be a complex expression where multiple memory locations
@@ -240,11 +237,6 @@ void VLASizeChecker::reportBug(VLASize_Kind Kind, const Expr *SizeE,
if (!N)
return;
- if (!BT)
- BT.reset(new BugType(this,
- "Dangerous variable-length array (VLA) declaration",
- categories::LogicError));
-
SmallString<256> buf;
llvm::raw_svector_ostream os(buf);
os << "Declared variable-length array (VLA) ";
@@ -263,7 +255,7 @@ void VLASizeChecker::reportBug(VLASize_Kind Kind, const Expr *SizeE,
break;
}
- auto report = std::make_unique<PathSensitiveBugReport>(*BT, os.str(), N);
+ auto report = std::make_unique<PathSensitiveBugReport>(BT, os.str(), N);
report->addRange(SizeE->getSourceRange());
bugreporter::trackExpressionValue(N, SizeE, *report);
C.emitReport(std::move(report));
diff --git a/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
index 8a1e02748c9b..cb73ac68edd1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/VforkChecker.cpp
@@ -44,7 +44,7 @@ namespace {
class VforkChecker : public Checker<check::PreCall, check::PostCall,
check::Bind, check::PreStmt<ReturnStmt>> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "Dangerous construct in a vforked process"};
mutable llvm::SmallSet<const IdentifierInfo *, 10> VforkAllowlist;
mutable const IdentifierInfo *II_vfork = nullptr;
@@ -123,9 +123,6 @@ bool VforkChecker::isCallExplicitelyAllowed(const IdentifierInfo *II,
void VforkChecker::reportBug(const char *What, CheckerContext &C,
const char *Details) const {
if (ExplodedNode *N = C.generateErrorNode(C.getState())) {
- if (!BT)
- BT.reset(new BugType(this, "Dangerous construct in a vforked process"));
-
SmallString<256> buf;
llvm::raw_svector_ostream os(buf);
@@ -134,7 +131,7 @@ void VforkChecker::reportBug(const char *What, CheckerContext &C,
if (Details)
os << "; " << Details;
- auto Report = std::make_unique<PathSensitiveBugReport>(*BT, os.str(), N);
+ auto Report = std::make_unique<PathSensitiveBugReport>(BT, os.str(), N);
// TODO: mark vfork call in BugReportVisitor
C.emitReport(std::move(Report));
}
diff --git a/clang/lib/StaticAnalyzer/Core/CommonBugCategories.cpp b/clang/lib/StaticAnalyzer/Core/CommonBugCategories.cpp
index e9cc080caf5f..66fab523c864 100644
--- a/clang/lib/StaticAnalyzer/Core/CommonBugCategories.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CommonBugCategories.cpp
@@ -13,6 +13,7 @@ namespace clang {
namespace ento {
namespace categories {
+const char *const AppleAPIMisuse = "API Misuse (Apple)";
const char *const CoreFoundationObjectiveC = "Core Foundation/Objective-C";
const char *const LogicError = "Logic error";
const char *const MemoryRefCount =
diff --git a/clang/lib/StaticAnalyzer/Core/Environment.cpp b/clang/lib/StaticAnalyzer/Core/Environment.cpp
index 0102f743c911..4f989ed59bee 100644
--- a/clang/lib/StaticAnalyzer/Core/Environment.cpp
+++ b/clang/lib/StaticAnalyzer/Core/Environment.cpp
@@ -193,7 +193,7 @@ EnvironmentManager::removeDeadBindings(Environment Env,
// Iterate over the block-expr bindings.
for (Environment::iterator I = Env.begin(), End = Env.end(); I != End; ++I) {
const EnvironmentEntry &BlkExpr = I.getKey();
- const SVal &X = I.getData();
+ SVal X = I.getData();
const Expr *E = dyn_cast<Expr>(BlkExpr.getStmt());
if (!E)
diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
index c773cef30d5e..da9a1a1a4d1f 100644
--- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -2016,7 +2016,7 @@ std::optional<SVal> RegionStoreManager::getBindingForDerivedDefaultValue(
const TypedValueRegion *R, QualType Ty) {
if (const std::optional<SVal> &D = B.getDefaultBinding(superR)) {
- const SVal &val = *D;
+ SVal val = *D;
if (SymbolRef parentSym = val.getAsSymbol())
return svalBuilder.getDerivedRegionValueSymbolVal(parentSym, R);
@@ -2331,7 +2331,7 @@ bool RegionStoreManager::includedInBindings(Store store,
const ClusterBindings &Cluster = RI.getData();
for (ClusterBindings::iterator CI = Cluster.begin(), CE = Cluster.end();
CI != CE; ++CI) {
- const SVal &D = CI.getData();
+ SVal D = CI.getData();
if (const MemRegion *R = D.getAsRegion())
if (R->getBaseRegion() == region)
return true;
@@ -2500,7 +2500,7 @@ RegionStoreManager::bindArray(RegionBindingsConstRef B,
if (VI == VE)
break;
- const NonLoc &Idx = svalBuilder.makeArrayIndex(i);
+ NonLoc Idx = svalBuilder.makeArrayIndex(i);
const ElementRegion *ER = MRMgr.getElementRegion(ElementTy, Idx, R, Ctx);
if (ElementTy->isStructureOrClassType())
diff --git a/clang/lib/Support/RISCVVIntrinsicUtils.cpp b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
index bf47461b59e0..2de977a3dc72 100644
--- a/clang/lib/Support/RISCVVIntrinsicUtils.cpp
+++ b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
@@ -203,7 +203,7 @@ void RVVType::initBuiltinStr() {
}
break;
case ScalarTypeKind::BFloat:
- BuiltinStr += "b";
+ BuiltinStr += "y";
break;
default:
llvm_unreachable("ScalarType is invalid!");
diff --git a/clang/lib/Tooling/Refactoring/Lookup.cpp b/clang/lib/Tooling/Refactoring/Lookup.cpp
index 52799f16fab2..757fba0404e6 100644
--- a/clang/lib/Tooling/Refactoring/Lookup.cpp
+++ b/clang/lib/Tooling/Refactoring/Lookup.cpp
@@ -98,8 +98,8 @@ static StringRef getBestNamespaceSubstr(const DeclContext *DeclA,
// from NewName if it has an identical prefix.
std::string NS =
"::" + cast<NamespaceDecl>(DeclA)->getQualifiedNameAsString() + "::";
- if (NewName.starts_with(NS))
- return NewName.substr(NS.size());
+ if (NewName.consume_front(NS))
+ return NewName;
// No match yet. Strip of a namespace from the end of the chain and try
// again. This allows to get optimal qualifications even if the old and new
diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
index 33bfa8d3d81f..d192c7f42939 100644
--- a/clang/lib/Tooling/Tooling.cpp
+++ b/clang/lib/Tooling/Tooling.cpp
@@ -255,9 +255,7 @@ llvm::Expected<std::string> getAbsolutePath(llvm::vfs::FileSystem &FS,
StringRef File) {
StringRef RelativePath(File);
// FIXME: Should '.\\' be accepted on Win32?
- if (RelativePath.starts_with("./")) {
- RelativePath = RelativePath.substr(strlen("./"));
- }
+ RelativePath.consume_front("./");
SmallString<1024> AbsolutePath = RelativePath;
if (auto EC = FS.makeAbsolute(AbsolutePath))
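Both refactorings above replace a starts_with()/substr() pair with llvm::StringRef::consume_front(), which strips the prefix in place and reports whether it was present. A small stand-alone sketch (the demo function is illustrative, not from this patch):

#include "llvm/ADT/StringRef.h"
#include <cassert>

static void consume_front_demo() {
  llvm::StringRef Path = "./a/b.cpp";
  // consume_front() removes the prefix and returns true iff it was there,
  // so no separate starts_with() check or substr() call is needed.
  bool HadDotSlash = Path.consume_front("./");
  assert(HadDotSlash && Path == "a/b.cpp");
  (void)HadDotSlash;
}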
diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h
index 7089bd8bfc9d..409a969a0d4c 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator.h
@@ -61,6 +61,7 @@ void clearerr(FILE *stream);
int feof(FILE *stream);
int ferror(FILE *stream);
int fileno(FILE *stream);
+int fflush(FILE *stream);
size_t strlen(const char *);
diff --git a/clang/test/Analysis/stream-errno.c b/clang/test/Analysis/stream-errno.c
index bf0a61db2424..f44ee6070708 100644
--- a/clang/test/Analysis/stream-errno.c
+++ b/clang/test/Analysis/stream-errno.c
@@ -222,3 +222,29 @@ void check_fileno(void) {
}
if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}}
}
+
+void check_fflush_opened_file(void) {
+ FILE *F = tmpfile();
+ if (!F)
+ return;
+ int N = fflush(F);
+ if (N == EOF) {
+ clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}}
+ if (errno) {} // no-warning
+ } else {
+ clang_analyzer_eval(N == 0); // expected-warning{{TRUE}}
+ if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}}
+ }
+ fclose(F);
+}
+
+void check_fflush_all(void) {
+ int N = fflush(NULL);
+ if (N == 0) {
+ if (errno) {} // expected-warning{{An undefined value may be read from 'errno'}}
+ } else {
+ clang_analyzer_eval(N == EOF); // expected-warning{{TRUE}}
+ clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}}
+ if (errno) {} // no-warning
+ }
+}
diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c
index c8332bcbfa8c..37e1e54dfc89 100644
--- a/clang/test/Analysis/stream-error.c
+++ b/clang/test/Analysis/stream-error.c
@@ -299,6 +299,73 @@ void error_fseek_0(void) {
fclose(F);
}
+void error_fflush_after_fclose(void) {
+ FILE *F = tmpfile();
+ int Ret;
+ fflush(NULL); // no-warning
+ if (!F)
+ return;
+ if ((Ret = fflush(F)) != 0)
+ clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}}
+ fclose(F);
+ fflush(F); // expected-warning {{Stream might be already closed}}
+}
+
+void error_fflush_on_open_failed_stream(void) {
+ FILE *F = tmpfile();
+ if (!F) {
+ fflush(F); // no-warning
+ return;
+ }
+ fclose(F);
+}
+
+void error_fflush_on_unknown_stream(FILE *F) {
+ fflush(F); // no-warning
+ fclose(F); // no-warning
+}
+
+void error_fflush_on_non_null_stream_clear_error_states(void) {
+ FILE *F0 = tmpfile(), *F1 = tmpfile();
+ // `fflush` clears a non-EOF stream's error state.
+ if (F0) {
+ StreamTesterChecker_make_ferror_stream(F0);
+ if (fflush(F0) == 0) { // no-warning
+ clang_analyzer_eval(ferror(F0)); // expected-warning {{FALSE}}
+ clang_analyzer_eval(feof(F0)); // expected-warning {{FALSE}}
+ }
+ fclose(F0);
+ }
+ // `fflush` clears an EOF stream's error state.
+ if (F1) {
+ StreamTesterChecker_make_feof_stream(F1);
+ if (fflush(F1) == 0) { // no-warning
+ clang_analyzer_eval(ferror(F1)); // expected-warning {{FALSE}}
+ clang_analyzer_eval(feof(F1)); // expected-warning {{TRUE}}
+ }
+ fclose(F1);
+ }
+}
+
+void error_fflush_on_null_stream_clear_error_states(void) {
+ FILE *F0 = tmpfile(), *F1 = tmpfile();
+ // `fflush` clears all streams' error states, while retaining their EOF states.
+ if (F0 && F1) {
+ StreamTesterChecker_make_ferror_stream(F0);
+ StreamTesterChecker_make_feof_stream(F1);
+ if (fflush(NULL) == 0) { // no-warning
+ clang_analyzer_eval(ferror(F0)); // expected-warning {{FALSE}}
+ clang_analyzer_eval(feof(F0)); // expected-warning {{FALSE}}
+ clang_analyzer_eval(ferror(F1)); // expected-warning {{FALSE}}
+ clang_analyzer_eval(feof(F1)); // expected-warning {{TRUE}}
+ }
+ }
+ if (F0)
+ fclose(F0);
+ if (F1)
+ fclose(F1);
+}
+
void error_indeterminate(void) {
FILE *F = fopen("file", "r+");
if (!F)
diff --git a/clang/test/Analysis/string.cpp b/clang/test/Analysis/string.cpp
index f86416da6ee2..1be6c21466cc 100644
--- a/clang/test/Analysis/string.cpp
+++ b/clang/test/Analysis/string.cpp
@@ -1,6 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix -verify %s
-
-// expected-no-diagnostics
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,debug.ExprInspection -verify %s
// Test functions that are called "memcpy" but aren't the memcpy
// we're looking for. Unfortunately, this test cannot be put into
@@ -9,6 +7,11 @@
typedef __typeof(sizeof(int)) size_t;
void *memcpy(void *, const void *, size_t);
+int sprintf(char *str, const char *format, ...);
+int snprintf(char *str, size_t size, const char *format, ...);
+
+void clang_analyzer_warnIfReached();
+
struct S {
static S s1, s2;
@@ -26,3 +29,19 @@ void *memcpy(void *, const S &, size_t);
void test_out_of_class_weird_memcpy() {
memcpy(&S::s1, S::s2, 1); // no-crash
}
+
+template<typename... Args>
+void log(const char* fmt, const Args&... args) {
+ char buf[100] = {};
+ auto f = snprintf;
+ auto g = sprintf;
+ int n = 0;
+ n += f(buf, 99, fmt, args...); // no-crash: The CalleeDecl is a VarDecl, but it's okay.
+ n += g(buf, fmt, args...); // no-crash: Same.
+ (void)n;
+ clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void test_gh_74269_no_crash() {
+ log("%d", 1);
+}
diff --git a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp
index eac9ac0e8279..2bceb3e26779 100644
--- a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp
+++ b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp
@@ -1,4 +1,7 @@
// RUN: %clang_cc1 -fsyntax-only -std=c++11 -triple x86_64-apple-macosx10.6.7 -verify %s
+// The following narrowing does not involve const references, so
+// -Wno-c++11-narrowing-const-reference does not suppress the errors.
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -triple x86_64-apple-macosx10.6.7 -Wno-c++11-narrowing-const-reference -verify %s
// Verify that narrowing conversions in initializer lists cause errors in C++0x
// mode.
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c
index 09b2d5fcacf5..9a8ce224bcfd 100644
--- a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c
@@ -5,4426 +5,6382 @@
// CHECK-LABEL: @xvsll_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsll_b(v32i8 _1, v32i8 _2) { return __lasx_xvsll_b(_1, _2); }
// CHECK-LABEL: @xvsll_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __lasx_xvsll_h(_1, _2); }
// CHECK-LABEL: @xvsll_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __lasx_xvsll_w(_1, _2); }
// CHECK-LABEL: @xvsll_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __lasx_xvsll_d(_1, _2); }
// CHECK-LABEL: @xvslli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslli_b(v32i8 _1) { return __lasx_xvslli_b(_1, 1); }
// CHECK-LABEL: @xvslli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslli_h(v16i16 _1) { return __lasx_xvslli_h(_1, 1); }
// CHECK-LABEL: @xvslli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslli_w(v8i32 _1) { return __lasx_xvslli_w(_1, 1); }
// CHECK-LABEL: @xvslli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslli_d(v4i64 _1) { return __lasx_xvslli_d(_1, 1); }
// CHECK-LABEL: @xvsra_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __lasx_xvsra_b(_1, _2); }
// CHECK-LABEL: @xvsra_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __lasx_xvsra_h(_1, _2); }
// CHECK-LABEL: @xvsra_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __lasx_xvsra_w(_1, _2); }
// CHECK-LABEL: @xvsra_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __lasx_xvsra_d(_1, _2); }
// CHECK-LABEL: @xvsrai_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrai_b(v32i8 _1) { return __lasx_xvsrai_b(_1, 1); }
// CHECK-LABEL: @xvsrai_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrai_h(v16i16 _1) { return __lasx_xvsrai_h(_1, 1); }
// CHECK-LABEL: @xvsrai_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrai_w(v8i32 _1) { return __lasx_xvsrai_w(_1, 1); }
// CHECK-LABEL: @xvsrai_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrai_d(v4i64 _1) { return __lasx_xvsrai_d(_1, 1); }
// CHECK-LABEL: @xvsrar_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrar_b(_1, _2); }
// CHECK-LABEL: @xvsrar_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrar_h(_1, _2); }
// CHECK-LABEL: @xvsrar_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrar_w(_1, _2); }
// CHECK-LABEL: @xvsrar_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrar_d(_1, _2); }
// CHECK-LABEL: @xvsrari_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrari_b(v32i8 _1) { return __lasx_xvsrari_b(_1, 1); }
// CHECK-LABEL: @xvsrari_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrari_h(v16i16 _1) { return __lasx_xvsrari_h(_1, 1); }
// CHECK-LABEL: @xvsrari_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrari_w(v8i32 _1) { return __lasx_xvsrari_w(_1, 1); }
// CHECK-LABEL: @xvsrari_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrari_d(v4i64 _1) { return __lasx_xvsrari_d(_1, 1); }
// CHECK-LABEL: @xvsrl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrl_b(_1, _2); }
// CHECK-LABEL: @xvsrl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrl_h(_1, _2); }
// CHECK-LABEL: @xvsrl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrl_w(_1, _2); }
// CHECK-LABEL: @xvsrl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrl_d(_1, _2); }
// CHECK-LABEL: @xvsrli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrli_b(v32i8 _1) { return __lasx_xvsrli_b(_1, 1); }
// CHECK-LABEL: @xvsrli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrli_h(v16i16 _1) { return __lasx_xvsrli_h(_1, 1); }
// CHECK-LABEL: @xvsrli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrli_w(v8i32 _1) { return __lasx_xvsrli_w(_1, 1); }
// CHECK-LABEL: @xvsrli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrli_d(v4i64 _1) { return __lasx_xvsrli_d(_1, 1); }
// CHECK-LABEL: @xvsrlr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrlr_b(_1, _2); }
// CHECK-LABEL: @xvsrlr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlr_h(_1, _2); }
// CHECK-LABEL: @xvsrlr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlr_w(_1, _2); }
// CHECK-LABEL: @xvsrlr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlr_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlr_d(_1, _2); }
// CHECK-LABEL: @xvsrlri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlri_b(v32i8 _1) { return __lasx_xvsrlri_b(_1, 1); }
// CHECK-LABEL: @xvsrlri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlri_h(v16i16 _1) { return __lasx_xvsrlri_h(_1, 1); }
// CHECK-LABEL: @xvsrlri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlri_w(v8i32 _1) { return __lasx_xvsrlri_w(_1, 1); }
// CHECK-LABEL: @xvsrlri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlri_d(v4i64 _1) { return __lasx_xvsrlri_d(_1, 1); }
// CHECK-LABEL: @xvbitclr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitclr_b(_1, _2); }
// CHECK-LABEL: @xvbitclr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitclr_h(_1, _2); }
// CHECK-LABEL: @xvbitclr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitclr_w(_1, _2); }
// CHECK-LABEL: @xvbitclr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitclr_d(_1, _2); }
// CHECK-LABEL: @xvbitclri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitclri_b(v32u8 _1) { return __lasx_xvbitclri_b(_1, 1); }
// CHECK-LABEL: @xvbitclri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitclri_h(v16u16 _1) { return __lasx_xvbitclri_h(_1, 1); }
// CHECK-LABEL: @xvbitclri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitclri_w(v8u32 _1) { return __lasx_xvbitclri_w(_1, 1); }
// CHECK-LABEL: @xvbitclri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitclri_d(v4u64 _1) { return __lasx_xvbitclri_d(_1, 1); }
// CHECK-LABEL: @xvbitset_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitset_b(_1, _2); }
// CHECK-LABEL: @xvbitset_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitset_h(_1, _2); }
// CHECK-LABEL: @xvbitset_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitset_w(_1, _2); }
// CHECK-LABEL: @xvbitset_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitset_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitset_d(_1, _2); }
// CHECK-LABEL: @xvbitseti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitseti_b(v32u8 _1) { return __lasx_xvbitseti_b(_1, 1); }
// CHECK-LABEL: @xvbitseti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitseti_h(v16u16 _1) { return __lasx_xvbitseti_h(_1, 1); }
// CHECK-LABEL: @xvbitseti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitseti_w(v8u32 _1) { return __lasx_xvbitseti_w(_1, 1); }
// CHECK-LABEL: @xvbitseti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitseti_d(v4u64 _1) { return __lasx_xvbitseti_d(_1, 1); }
// CHECK-LABEL: @xvbitrev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitrev_b(_1, _2); }
// CHECK-LABEL: @xvbitrev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitrev_h(_1, _2); }
// CHECK-LABEL: @xvbitrev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitrev_w(_1, _2); }
// CHECK-LABEL: @xvbitrev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitrev_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitrev_d(_1, _2); }
// CHECK-LABEL: @xvbitrevi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitrevi_b(v32u8 _1) { return __lasx_xvbitrevi_b(_1, 1); }
// CHECK-LABEL: @xvbitrevi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitrevi_h(v16u16 _1) { return __lasx_xvbitrevi_h(_1, 1); }
// CHECK-LABEL: @xvbitrevi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitrevi_w(v8u32 _1) { return __lasx_xvbitrevi_w(_1, 1); }
// CHECK-LABEL: @xvbitrevi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitrevi_d(v4u64 _1) { return __lasx_xvbitrevi_d(_1, 1); }
// CHECK-LABEL: @xvadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvadd_b(_1, _2); }
// CHECK-LABEL: @xvadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvadd_h(_1, _2); }
// CHECK-LABEL: @xvadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvadd_w(_1, _2); }
// CHECK-LABEL: @xvadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvadd_d(_1, _2); }
// CHECK-LABEL: @xvaddi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvaddi_bu(v32i8 _1) { return __lasx_xvaddi_bu(_1, 1); }
// CHECK-LABEL: @xvaddi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddi_hu(v16i16 _1) { return __lasx_xvaddi_hu(_1, 1); }
// CHECK-LABEL: @xvaddi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddi_wu(v8i32 _1) { return __lasx_xvaddi_wu(_1, 1); }
// CHECK-LABEL: @xvaddi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddi_du(v4i64 _1) { return __lasx_xvaddi_du(_1, 1); }
// CHECK-LABEL: @xvsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __lasx_xvsub_b(_1, _2); }
// CHECK-LABEL: @xvsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __lasx_xvsub_h(_1, _2); }
// CHECK-LABEL: @xvsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __lasx_xvsub_w(_1, _2); }
// CHECK-LABEL: @xvsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __lasx_xvsub_d(_1, _2); }
// CHECK-LABEL: @xvsubi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsubi_bu(v32i8 _1) { return __lasx_xvsubi_bu(_1, 1); }
// CHECK-LABEL: @xvsubi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubi_hu(v16i16 _1) { return __lasx_xvsubi_hu(_1, 1); }
// CHECK-LABEL: @xvsubi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubi_wu(v8i32 _1) { return __lasx_xvsubi_wu(_1, 1); }
// CHECK-LABEL: @xvsubi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubi_du(v4i64 _1) { return __lasx_xvsubi_du(_1, 1); }
// CHECK-LABEL: @xvmax_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __lasx_xvmax_b(_1, _2); }
// CHECK-LABEL: @xvmax_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmax_h(v16i16 _1, v16i16 _2) { return __lasx_xvmax_h(_1, _2); }
// CHECK-LABEL: @xvmax_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __lasx_xvmax_w(_1, _2); }
// CHECK-LABEL: @xvmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __lasx_xvmax_d(_1, _2); }
// CHECK-LABEL: @xvmaxi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmaxi_b(v32i8 _1) { return __lasx_xvmaxi_b(_1, 1); }
// CHECK-LABEL: @xvmaxi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaxi_h(v16i16 _1) { return __lasx_xvmaxi_h(_1, 1); }
// CHECK-LABEL: @xvmaxi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaxi_w(v8i32 _1) { return __lasx_xvmaxi_w(_1, 1); }
// CHECK-LABEL: @xvmaxi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaxi_d(v4i64 _1) { return __lasx_xvmaxi_d(_1, 1); }
// CHECK-LABEL: @xvmax_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmax_bu(_1, _2); }
// CHECK-LABEL: @xvmax_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmax_hu(_1, _2); }
// CHECK-LABEL: @xvmax_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmax_wu(_1, _2); }
// CHECK-LABEL: @xvmax_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __lasx_xvmax_du(_1, _2); }
// CHECK-LABEL: @xvmaxi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmaxi_bu(v32u8 _1) { return __lasx_xvmaxi_bu(_1, 1); }
// CHECK-LABEL: @xvmaxi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmaxi_hu(v16u16 _1) { return __lasx_xvmaxi_hu(_1, 1); }
// CHECK-LABEL: @xvmaxi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmaxi_wu(v8u32 _1) { return __lasx_xvmaxi_wu(_1, 1); }
// CHECK-LABEL: @xvmaxi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaxi_du(v4u64 _1) { return __lasx_xvmaxi_du(_1, 1); }
// CHECK-LABEL: @xvmin_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __lasx_xvmin_b(_1, _2); }
// CHECK-LABEL: @xvmin_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __lasx_xvmin_h(_1, _2); }
// CHECK-LABEL: @xvmin_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __lasx_xvmin_w(_1, _2); }
// CHECK-LABEL: @xvmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __lasx_xvmin_d(_1, _2); }
// CHECK-LABEL: @xvmini_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmini_b(v32i8 _1) { return __lasx_xvmini_b(_1, 1); }
// CHECK-LABEL: @xvmini_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmini_h(v16i16 _1) { return __lasx_xvmini_h(_1, 1); }
// CHECK-LABEL: @xvmini_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmini_w(v8i32 _1) { return __lasx_xvmini_w(_1, 1); }
// CHECK-LABEL: @xvmini_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmini_d(v4i64 _1) { return __lasx_xvmini_d(_1, 1); }
// CHECK-LABEL: @xvmin_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmin_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmin_bu(_1, _2); }
// CHECK-LABEL: @xvmin_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmin_hu(_1, _2); }
// CHECK-LABEL: @xvmin_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmin_wu(_1, _2); }
// CHECK-LABEL: @xvmin_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __lasx_xvmin_du(_1, _2); }
// CHECK-LABEL: @xvmini_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmini_bu(v32u8 _1) { return __lasx_xvmini_bu(_1, 1); }
// CHECK-LABEL: @xvmini_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmini_hu(v16u16 _1) { return __lasx_xvmini_hu(_1, 1); }
// CHECK-LABEL: @xvmini_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmini_wu(v8u32 _1) { return __lasx_xvmini_wu(_1, 1); }
// CHECK-LABEL: @xvmini_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmini_du(v4u64 _1) { return __lasx_xvmini_du(_1, 1); }
// CHECK-LABEL: @xvseq_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __lasx_xvseq_b(_1, _2); }
// CHECK-LABEL: @xvseq_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __lasx_xvseq_h(_1, _2); }
// CHECK-LABEL: @xvseq_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __lasx_xvseq_w(_1, _2); }
// CHECK-LABEL: @xvseq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __lasx_xvseq_d(_1, _2); }
// CHECK-LABEL: @xvseqi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvseqi_b(v32i8 _1) { return __lasx_xvseqi_b(_1, 1); }
// CHECK-LABEL: @xvseqi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvseqi_h(v16i16 _1) { return __lasx_xvseqi_h(_1, 1); }
// CHECK-LABEL: @xvseqi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvseqi_w(v8i32 _1) { return __lasx_xvseqi_w(_1, 1); }
// CHECK-LABEL: @xvseqi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvseqi_d(v4i64 _1) { return __lasx_xvseqi_d(_1, 1); }
// CHECK-LABEL: @xvslt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslt_b(v32i8 _1, v32i8 _2) { return __lasx_xvslt_b(_1, _2); }
// CHECK-LABEL: @xvslt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __lasx_xvslt_h(_1, _2); }
// CHECK-LABEL: @xvslt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __lasx_xvslt_w(_1, _2); }
// CHECK-LABEL: @xvslt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __lasx_xvslt_d(_1, _2); }
// CHECK-LABEL: @xvslti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslti_b(v32i8 _1) { return __lasx_xvslti_b(_1, 1); }
// CHECK-LABEL: @xvslti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslti_h(v16i16 _1) { return __lasx_xvslti_h(_1, 1); }
// CHECK-LABEL: @xvslti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslti_w(v8i32 _1) { return __lasx_xvslti_w(_1, 1); }
// CHECK-LABEL: @xvslti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslti_d(v4i64 _1) { return __lasx_xvslti_d(_1, 1); }
// CHECK-LABEL: @xvslt_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __lasx_xvslt_bu(_1, _2); }
// CHECK-LABEL: @xvslt_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __lasx_xvslt_hu(_1, _2); }
// CHECK-LABEL: @xvslt_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __lasx_xvslt_wu(_1, _2); }
// CHECK-LABEL: @xvslt_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __lasx_xvslt_du(_1, _2); }
// CHECK-LABEL: @xvslti_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslti_bu(v32u8 _1) { return __lasx_xvslti_bu(_1, 1); }
// CHECK-LABEL: @xvslti_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslti_hu(v16u16 _1) { return __lasx_xvslti_hu(_1, 1); }
// CHECK-LABEL: @xvslti_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslti_wu(v8u32 _1) { return __lasx_xvslti_wu(_1, 1); }
// CHECK-LABEL: @xvslti_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslti_du(v4u64 _1) { return __lasx_xvslti_du(_1, 1); }
// CHECK-LABEL: @xvsle_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __lasx_xvsle_b(_1, _2); }
// CHECK-LABEL: @xvsle_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __lasx_xvsle_h(_1, _2); }
// CHECK-LABEL: @xvsle_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __lasx_xvsle_w(_1, _2); }
// CHECK-LABEL: @xvsle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return __lasx_xvsle_d(_1, _2); }
// CHECK-LABEL: @xvslei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslei_b(v32i8 _1) { return __lasx_xvslei_b(_1, 1); }
// CHECK-LABEL: @xvslei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslei_h(v16i16 _1) { return __lasx_xvslei_h(_1, 1); }
// CHECK-LABEL: @xvslei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslei_w(v8i32 _1) { return __lasx_xvslei_w(_1, 1); }
// CHECK-LABEL: @xvslei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslei_d(v4i64 _1) { return __lasx_xvslei_d(_1, 1); }
// CHECK-LABEL: @xvsle_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsle_bu(_1, _2); }
// CHECK-LABEL: @xvsle_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsle_hu(_1, _2); }
// CHECK-LABEL: @xvsle_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsle_wu(_1, _2); }
// CHECK-LABEL: @xvsle_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __lasx_xvsle_du(_1, _2); }
// CHECK-LABEL: @xvslei_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslei_bu(v32u8 _1) { return __lasx_xvslei_bu(_1, 1); }
// CHECK-LABEL: @xvslei_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslei_hu(v16u16 _1) { return __lasx_xvslei_hu(_1, 1); }
// CHECK-LABEL: @xvslei_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslei_wu(v8u32 _1) { return __lasx_xvslei_wu(_1, 1); }
// CHECK-LABEL: @xvslei_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslei_du(v4u64 _1) { return __lasx_xvslei_du(_1, 1); }
// CHECK-LABEL: @xvsat_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsat_b(v32i8 _1) { return __lasx_xvsat_b(_1, 1); }
// CHECK-LABEL: @xvsat_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsat_h(v16i16 _1) { return __lasx_xvsat_h(_1, 1); }
// CHECK-LABEL: @xvsat_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsat_w(v8i32 _1) { return __lasx_xvsat_w(_1, 1); }
// CHECK-LABEL: @xvsat_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsat_d(v4i64 _1) { return __lasx_xvsat_d(_1, 1); }
// CHECK-LABEL: @xvsat_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvsat_bu(v32u8 _1) { return __lasx_xvsat_bu(_1, 1); }
// CHECK-LABEL: @xvsat_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvsat_hu(v16u16 _1) { return __lasx_xvsat_hu(_1, 1); }
// CHECK-LABEL: @xvsat_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvsat_wu(v8u32 _1) { return __lasx_xvsat_wu(_1, 1); }
// CHECK-LABEL: @xvsat_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvsat_du(v4u64 _1) { return __lasx_xvsat_du(_1, 1); }
// CHECK-LABEL: @xvadda_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __lasx_xvadda_b(_1, _2); }
// CHECK-LABEL: @xvadda_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __lasx_xvadda_h(_1, _2); }
// CHECK-LABEL: @xvadda_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __lasx_xvadda_w(_1, _2); }
// CHECK-LABEL: @xvadda_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __lasx_xvadda_d(_1, _2); }
// CHECK-LABEL: @xvsadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvsadd_b(_1, _2); }
// CHECK-LABEL: @xvsadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvsadd_h(_1, _2); }
// CHECK-LABEL: @xvsadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvsadd_w(_1, _2); }
// CHECK-LABEL: @xvsadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvsadd_d(_1, _2); }
// CHECK-LABEL: @xvsadd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsadd_bu(_1, _2); }
// CHECK-LABEL: @xvsadd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsadd_hu(_1, _2); }
// CHECK-LABEL: @xvsadd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsadd_wu(_1, _2); }
// CHECK-LABEL: @xvsadd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __lasx_xvsadd_du(_1, _2); }
// CHECK-LABEL: @xvavg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __lasx_xvavg_b(_1, _2); }
// CHECK-LABEL: @xvavg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __lasx_xvavg_h(_1, _2); }
// CHECK-LABEL: @xvavg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvavg_w(v8i32 _1, v8i32 _2) { return __lasx_xvavg_w(_1, _2); }
// CHECK-LABEL: @xvavg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __lasx_xvavg_d(_1, _2); }
// CHECK-LABEL: @xvavg_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavg_bu(_1, _2); }
// CHECK-LABEL: @xvavg_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavg_hu(_1, _2); }
// CHECK-LABEL: @xvavg_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavg_wu(_1, _2); }
// CHECK-LABEL: @xvavg_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvavg_du(v4u64 _1, v4u64 _2) { return __lasx_xvavg_du(_1, _2); }
// CHECK-LABEL: @xvavgr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __lasx_xvavgr_b(_1, _2); }
// CHECK-LABEL: @xvavgr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __lasx_xvavgr_h(_1, _2); }
// CHECK-LABEL: @xvavgr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __lasx_xvavgr_w(_1, _2); }
// CHECK-LABEL: @xvavgr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __lasx_xvavgr_d(_1, _2); }
// CHECK-LABEL: @xvavgr_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavgr_bu(_1, _2); }
// CHECK-LABEL: @xvavgr_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavgr_hu(_1, _2); }
// CHECK-LABEL: @xvavgr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavgr_wu(_1, _2); }
// CHECK-LABEL: @xvavgr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __lasx_xvavgr_du(_1, _2); }
// CHECK-LABEL: @xvssub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __lasx_xvssub_b(_1, _2); }
// CHECK-LABEL: @xvssub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __lasx_xvssub_h(_1, _2); }
// CHECK-LABEL: @xvssub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __lasx_xvssub_w(_1, _2); }
// CHECK-LABEL: @xvssub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __lasx_xvssub_d(_1, _2); }
// CHECK-LABEL: @xvssub_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __lasx_xvssub_bu(_1, _2); }
// CHECK-LABEL: @xvssub_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __lasx_xvssub_hu(_1, _2); }
// CHECK-LABEL: @xvssub_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __lasx_xvssub_wu(_1, _2); }
// CHECK-LABEL: @xvssub_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __lasx_xvssub_du(_1, _2); }
// CHECK-LABEL: @xvabsd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __lasx_xvabsd_b(_1, _2); }
// CHECK-LABEL: @xvabsd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return __lasx_xvabsd_h(_1, _2); }
// CHECK-LABEL: @xvabsd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __lasx_xvabsd_w(_1, _2); }
// CHECK-LABEL: @xvabsd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __lasx_xvabsd_d(_1, _2); }
// CHECK-LABEL: @xvabsd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvabsd_bu(_1, _2); }
// CHECK-LABEL: @xvabsd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvabsd_hu(_1, _2); }
// CHECK-LABEL: @xvabsd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvabsd_wu(_1, _2); }
// CHECK-LABEL: @xvabsd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __lasx_xvabsd_du(_1, _2); }
// CHECK-LABEL: @xvmul_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __lasx_xvmul_b(_1, _2); }
// CHECK-LABEL: @xvmul_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __lasx_xvmul_h(_1, _2); }
// CHECK-LABEL: @xvmul_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __lasx_xvmul_w(_1, _2); }
// CHECK-LABEL: @xvmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __lasx_xvmul_d(_1, _2); }
// CHECK-LABEL: @xvmadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmadd_b(_1, _2, _3); }
// CHECK-LABEL: @xvmadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmadd_h(_1, _2, _3); }
// CHECK-LABEL: @xvmadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmadd_w(_1, _2, _3); }
// CHECK-LABEL: @xvmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmadd_d(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmsub_b(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmsub_h(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmsub_w(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmsub_d(_1, _2, _3); }
// CHECK-LABEL: @xvdiv_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __lasx_xvdiv_b(_1, _2); }
// CHECK-LABEL: @xvdiv_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __lasx_xvdiv_h(_1, _2); }
// CHECK-LABEL: @xvdiv_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __lasx_xvdiv_w(_1, _2); }
// CHECK-LABEL: @xvdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __lasx_xvdiv_d(_1, _2); }
// CHECK-LABEL: @xvdiv_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { return __lasx_xvdiv_bu(_1, _2); }
// CHECK-LABEL: @xvdiv_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __lasx_xvdiv_hu(_1, _2); }
// CHECK-LABEL: @xvdiv_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __lasx_xvdiv_wu(_1, _2); }
// CHECK-LABEL: @xvdiv_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __lasx_xvdiv_du(_1, _2); }
// CHECK-LABEL: @xvhaddw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhaddw_h_b(_1, _2); }
// CHECK-LABEL: @xvhaddw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhaddw_w_h(_1, _2); }
// CHECK-LABEL: @xvhaddw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhaddw_d_w(_1, _2); }
// CHECK-LABEL: @xvhaddw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhaddw_hu_bu(_1, _2); }
// CHECK-LABEL: @xvhaddw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhaddw_wu_hu(_1, _2); }
// CHECK-LABEL: @xvhaddw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhaddw_du_wu(_1, _2); }
// CHECK-LABEL: @xvhsubw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvhsubw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhsubw_h_b(_1, _2); }
// CHECK-LABEL: @xvhsubw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhsubw_w_h(_1, _2); }
// CHECK-LABEL: @xvhsubw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhsubw_d_w(_1, _2); }
// CHECK-LABEL: @xvhsubw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhsubw_hu_bu(_1, _2); }
// CHECK-LABEL: @xvhsubw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhsubw_wu_hu(_1, _2); }
// CHECK-LABEL: @xvhsubw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhsubw_du_wu(_1, _2); }
// CHECK-LABEL: @xvmod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __lasx_xvmod_b(_1, _2); }
// CHECK-LABEL: @xvmod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __lasx_xvmod_h(_1, _2); }
// CHECK-LABEL: @xvmod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __lasx_xvmod_w(_1, _2); }
// CHECK-LABEL: @xvmod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __lasx_xvmod_d(_1, _2); }
// CHECK-LABEL: @xvmod_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmod_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmod_bu(_1, _2); }
// CHECK-LABEL: @xvmod_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmod_hu(_1, _2); }
// CHECK-LABEL: @xvmod_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmod_wu(_1, _2); }
// CHECK-LABEL: @xvmod_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __lasx_xvmod_du(_1, _2); }
// CHECK-LABEL: @xvrepl128vei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrepl128vei_b(v32i8 _1) { return __lasx_xvrepl128vei_b(_1, 1); }
// CHECK-LABEL: @xvrepl128vei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrepl128vei_h(v16i16 _1) { return __lasx_xvrepl128vei_h(_1, 1); }
// CHECK-LABEL: @xvrepl128vei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrepl128vei_w(v8i32 _1) { return __lasx_xvrepl128vei_w(_1, 1); }
// CHECK-LABEL: @xvrepl128vei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrepl128vei_d(v4i64 _1) { return __lasx_xvrepl128vei_d(_1, 1); }
// CHECK-LABEL: @xvpickev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpickev_b(_1, _2); }
// CHECK-LABEL: @xvpickev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickev_h(_1, _2); }
// CHECK-LABEL: @xvpickev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickev_w(_1, _2); }
// CHECK-LABEL: @xvpickev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickev_d(_1, _2); }
// CHECK-LABEL: @xvpickod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpickod_b(v32i8 _1, v32i8 _2) { return __lasx_xvpickod_b(_1, _2); }
// CHECK-LABEL: @xvpickod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickod_h(_1, _2); }
// CHECK-LABEL: @xvpickod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickod_w(_1, _2); }
// CHECK-LABEL: @xvpickod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickod_d(_1, _2); }
// CHECK-LABEL: @xvilvh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return __lasx_xvilvh_b(_1, _2); }
// CHECK-LABEL: @xvilvh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __lasx_xvilvh_h(_1, _2); }
// CHECK-LABEL: @xvilvh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvh_w(_1, _2); }
// CHECK-LABEL: @xvilvh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvh_d(_1, _2); }
// CHECK-LABEL: @xvilvl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __lasx_xvilvl_b(_1, _2); }
// CHECK-LABEL: @xvilvl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return __lasx_xvilvl_h(_1, _2); }
// CHECK-LABEL: @xvilvl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvl_w(_1, _2); }
// CHECK-LABEL: @xvilvl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvl_d(_1, _2); }
// CHECK-LABEL: @xvpackev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackev_b(_1, _2); }
// CHECK-LABEL: @xvpackev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackev_h(_1, _2); }
// CHECK-LABEL: @xvpackev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpackev_w(v8i32 _1, v8i32 _2) { return __lasx_xvpackev_w(_1, _2); }
// CHECK-LABEL: @xvpackev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackev_d(_1, _2); }
// CHECK-LABEL: @xvpackod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackod_b(_1, _2); }
// CHECK-LABEL: @xvpackod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackod_h(_1, _2); }
// CHECK-LABEL: @xvpackod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpackod_w(_1, _2); }
// CHECK-LABEL: @xvpackod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpackod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackod_d(_1, _2); }
// CHECK-LABEL: @xvshuf_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvshuf_b(_1, _2, _3); }
// CHECK-LABEL: @xvshuf_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvshuf_h(_1, _2, _3); }
// CHECK-LABEL: @xvshuf_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvshuf_w(_1, _2, _3); }
// CHECK-LABEL: @xvshuf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvshuf_d(_1, _2, _3); }
// CHECK-LABEL: @xvand_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __lasx_xvand_v(_1, _2); }
// CHECK-LABEL: @xvandi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvandi_b(v32u8 _1) { return __lasx_xvandi_b(_1, 1); }
// CHECK-LABEL: @xvor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __lasx_xvor_v(_1, _2); }
// CHECK-LABEL: @xvori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvori_b(v32u8 _1) { return __lasx_xvori_b(_1, 1); }
// CHECK-LABEL: @xvnor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __lasx_xvnor_v(_1, _2); }
// CHECK-LABEL: @xvnori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvnori_b(v32u8 _1) { return __lasx_xvnori_b(_1, 1); }
// CHECK-LABEL: @xvxor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __lasx_xvxor_v(_1, _2); }
// CHECK-LABEL: @xvxori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvxori_b(v32u8 _1) { return __lasx_xvxori_b(_1, 1); }
// CHECK-LABEL: @xvbitsel_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __lasx_xvbitsel_v(_1, _2, _3); }
// CHECK-LABEL: @xvbitseli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitseli_b(_1, _2, 1); }
// CHECK-LABEL: @xvshuf4i_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvshuf4i_b(v32i8 _1) { return __lasx_xvshuf4i_b(_1, 1); }
// CHECK-LABEL: @xvshuf4i_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvshuf4i_h(v16i16 _1) { return __lasx_xvshuf4i_h(_1, 1); }
// CHECK-LABEL: @xvshuf4i_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvshuf4i_w(v8i32 _1) { return __lasx_xvshuf4i_w(_1, 1); }
// CHECK-LABEL: @xvreplgr2vr_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplgr2vr_b(int _1) { return __lasx_xvreplgr2vr_b(_1); }
// CHECK-LABEL: @xvreplgr2vr_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvreplgr2vr_h(int _1) { return __lasx_xvreplgr2vr_h(_1); }
// CHECK-LABEL: @xvreplgr2vr_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvreplgr2vr_w(int _1) { return __lasx_xvreplgr2vr_w(_1); }
// CHECK-LABEL: @xvreplgr2vr_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 [[CONV]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvreplgr2vr_d(int _1) { return __lasx_xvreplgr2vr_d(_1); }
// CHECK-LABEL: @xvpcnt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpcnt_b(v32i8 _1) { return __lasx_xvpcnt_b(_1); }
// CHECK-LABEL: @xvpcnt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpcnt_h(v16i16 _1) { return __lasx_xvpcnt_h(_1); }
// CHECK-LABEL: @xvpcnt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpcnt_w(v8i32 _1) { return __lasx_xvpcnt_w(_1); }
// CHECK-LABEL: @xvpcnt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpcnt_d(v4i64 _1) { return __lasx_xvpcnt_d(_1); }
// CHECK-LABEL: @xvclo_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvclo_b(v32i8 _1) { return __lasx_xvclo_b(_1); }
// CHECK-LABEL: @xvclo_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvclo_h(v16i16 _1) { return __lasx_xvclo_h(_1); }
// CHECK-LABEL: @xvclo_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvclo_w(v8i32 _1) { return __lasx_xvclo_w(_1); }
// CHECK-LABEL: @xvclo_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvclo_d(v4i64 _1) { return __lasx_xvclo_d(_1); }
// CHECK-LABEL: @xvclz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvclz_b(v32i8 _1) { return __lasx_xvclz_b(_1); }
// CHECK-LABEL: @xvclz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvclz_h(v16i16 _1) { return __lasx_xvclz_h(_1); }
// CHECK-LABEL: @xvclz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvclz_w(v8i32 _1) { return __lasx_xvclz_w(_1); }
// CHECK-LABEL: @xvclz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvclz_d(v4i64 _1) { return __lasx_xvclz_d(_1); }
// CHECK-LABEL: @xvfadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __lasx_xvfadd_s(_1, _2); }
// CHECK-LABEL: @xvfadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __lasx_xvfadd_d(_1, _2); }
// CHECK-LABEL: @xvfsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __lasx_xvfsub_s(_1, _2); }
// CHECK-LABEL: @xvfsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __lasx_xvfsub_d(_1, _2); }
// CHECK-LABEL: @xvfmul_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmul_s(_1, _2); }
// CHECK-LABEL: @xvfmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmul_d(_1, _2); }
// CHECK-LABEL: @xvfdiv_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __lasx_xvfdiv_s(_1, _2); }
// CHECK-LABEL: @xvfdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __lasx_xvfdiv_d(_1, _2); }
// CHECK-LABEL: @xvfcvt_h_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcvt_h_s(_1, _2); }
// CHECK-LABEL: @xvfcvt_s_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcvt_s_d(_1, _2); }
// CHECK-LABEL: @xvfmin_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmin_s(_1, _2); }
// CHECK-LABEL: @xvfmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmin_d(_1, _2); }
// CHECK-LABEL: @xvfmina_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmina_s(_1, _2); }
// CHECK-LABEL: @xvfmina_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmina_d(_1, _2); }
// CHECK-LABEL: @xvfmax_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmax_s(_1, _2); }
// CHECK-LABEL: @xvfmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmax_d(_1, _2); }
// CHECK-LABEL: @xvfmaxa_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmaxa_s(_1, _2); }
// CHECK-LABEL: @xvfmaxa_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmaxa_d(_1, _2); }
// CHECK-LABEL: @xvfclass_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfclass_s(v8f32 _1) { return __lasx_xvfclass_s(_1); }
// CHECK-LABEL: @xvfclass_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfclass_d(v4f64 _1) { return __lasx_xvfclass_d(_1); }
// CHECK-LABEL: @xvfsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfsqrt_s(v8f32 _1) { return __lasx_xvfsqrt_s(_1); }
// CHECK-LABEL: @xvfsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfsqrt_d(v4f64 _1) { return __lasx_xvfsqrt_d(_1); }
// CHECK-LABEL: @xvfrecip_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfrecip_s(v8f32 _1) { return __lasx_xvfrecip_s(_1); }
// CHECK-LABEL: @xvfrecip_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfrecip_d(v4f64 _1) { return __lasx_xvfrecip_d(_1); }
// CHECK-LABEL: @xvfrint_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfrint_s(v8f32 _1) { return __lasx_xvfrint_s(_1); }
// CHECK-LABEL: @xvfrint_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfrint_d(v4f64 _1) { return __lasx_xvfrint_d(_1); }
// CHECK-LABEL: @xvfrsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfrsqrt_s(v8f32 _1) { return __lasx_xvfrsqrt_s(_1); }
// CHECK-LABEL: @xvfrsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfrsqrt_d(v4f64 _1) { return __lasx_xvfrsqrt_d(_1); }
// CHECK-LABEL: @xvflogb_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvflogb_s(v8f32 _1) { return __lasx_xvflogb_s(_1); }
// CHECK-LABEL: @xvflogb_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvflogb_d(v4f64 _1) { return __lasx_xvflogb_d(_1); }
// CHECK-LABEL: @xvfcvth_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfcvth_s_h(v16i16 _1) { return __lasx_xvfcvth_s_h(_1); }
// CHECK-LABEL: @xvfcvth_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfcvth_d_s(v8f32 _1) { return __lasx_xvfcvth_d_s(_1); }
// CHECK-LABEL: @xvfcvtl_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfcvtl_s_h(v16i16 _1) { return __lasx_xvfcvtl_s_h(_1); }
// CHECK-LABEL: @xvfcvtl_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfcvtl_d_s(v8f32 _1) { return __lasx_xvfcvtl_d_s(_1); }
// CHECK-LABEL: @xvftint_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftint_w_s(v8f32 _1) { return __lasx_xvftint_w_s(_1); }
// CHECK-LABEL: @xvftint_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftint_l_d(v4f64 _1) { return __lasx_xvftint_l_d(_1); }
// CHECK-LABEL: @xvftint_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvftint_wu_s(v8f32 _1) { return __lasx_xvftint_wu_s(_1); }
// CHECK-LABEL: @xvftint_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvftint_lu_d(v4f64 _1) { return __lasx_xvftint_lu_d(_1); }
// CHECK-LABEL: @xvftintrz_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrz_w_s(v8f32 _1) { return __lasx_xvftintrz_w_s(_1); }
// CHECK-LABEL: @xvftintrz_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrz_l_d(v4f64 _1) { return __lasx_xvftintrz_l_d(_1); }
// CHECK-LABEL: @xvftintrz_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvftintrz_wu_s(v8f32 _1) { return __lasx_xvftintrz_wu_s(_1); }
// CHECK-LABEL: @xvftintrz_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvftintrz_lu_d(v4f64 _1) { return __lasx_xvftintrz_lu_d(_1); }
// CHECK-LABEL: @xvffint_s_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvffint_s_w(v8i32 _1) { return __lasx_xvffint_s_w(_1); }
// CHECK-LABEL: @xvffint_d_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffint_d_l(v4i64 _1) { return __lasx_xvffint_d_l(_1); }
// CHECK-LABEL: @xvffint_s_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvffint_s_wu(v8u32 _1) { return __lasx_xvffint_s_wu(_1); }
// CHECK-LABEL: @xvffint_d_lu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffint_d_lu(v4u64 _1) { return __lasx_xvffint_d_lu(_1); }
// CHECK-LABEL: @xvreplve_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_112]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplve_b(v32i8 _1, int _2) { return __lasx_xvreplve_b(_1, _2); }
// CHECK-LABEL: @xvreplve_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_112]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvreplve_h(v16i16 _1, int _2) { return __lasx_xvreplve_h(_1, _2); }
// CHECK-LABEL: @xvreplve_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_112]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvreplve_w(v8i32 _1, int _2) { return __lasx_xvreplve_w(_1, _2); }
// CHECK-LABEL: @xvreplve_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvreplve_d(v4i64 _1, int _2) { return __lasx_xvreplve_d(_1, _2); }
// CHECK-LABEL: @xvpermi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __lasx_xvpermi_w(_1, _2, 1); }
// CHECK-LABEL: @xvandn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __lasx_xvandn_v(_1, _2); }
// CHECK-LABEL: @xvneg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvneg_b(v32i8 _1) { return __lasx_xvneg_b(_1); }
// CHECK-LABEL: @xvneg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvneg_h(v16i16 _1) { return __lasx_xvneg_h(_1); }
// CHECK-LABEL: @xvneg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvneg_w(v8i32 _1) { return __lasx_xvneg_w(_1); }
// CHECK-LABEL: @xvneg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvneg_d(v4i64 _1) { return __lasx_xvneg_d(_1); }
// CHECK-LABEL: @xvmuh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { return __lasx_xvmuh_b(_1, _2); }
// CHECK-LABEL: @xvmuh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __lasx_xvmuh_h(_1, _2); }
// CHECK-LABEL: @xvmuh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __lasx_xvmuh_w(_1, _2); }
// CHECK-LABEL: @xvmuh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __lasx_xvmuh_d(_1, _2); }
// CHECK-LABEL: @xvmuh_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmuh_bu(_1, _2); }
// CHECK-LABEL: @xvmuh_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmuh_hu(_1, _2); }
// CHECK-LABEL: @xvmuh_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmuh_wu(_1, _2); }
// CHECK-LABEL: @xvmuh_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __lasx_xvmuh_du(_1, _2); }
// CHECK-LABEL: @xvsllwil_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsllwil_h_b(v32i8 _1) { return __lasx_xvsllwil_h_b(_1, 1); }
// CHECK-LABEL: @xvsllwil_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsllwil_w_h(v16i16 _1) { return __lasx_xvsllwil_w_h(_1, 1); }
// CHECK-LABEL: @xvsllwil_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsllwil_d_w(v8i32 _1) { return __lasx_xvsllwil_d_w(_1, 1); }
// CHECK-LABEL: @xvsllwil_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvsllwil_hu_bu(v32u8 _1) { return __lasx_xvsllwil_hu_bu(_1, 1); }
// CHECK-LABEL: @xvsllwil_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvsllwil_wu_hu(v16u16 _1) { return __lasx_xvsllwil_wu_hu(_1, 1); }
// CHECK-LABEL: @xvsllwil_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvsllwil_du_wu(v8u32 _1) { return __lasx_xvsllwil_du_wu(_1, 1); }
// CHECK-LABEL: @xvsran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsran_b_h(_1, _2); }
// CHECK-LABEL: @xvsran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsran_h_w(_1, _2); }
// CHECK-LABEL: @xvsran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsran_w_d(_1, _2); }
// CHECK-LABEL: @xvssran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssran_b_h(_1, _2); }
// CHECK-LABEL: @xvssran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssran_h_w(_1, _2); }
// CHECK-LABEL: @xvssran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssran_w_d(_1, _2); }
// CHECK-LABEL: @xvssran_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssran_bu_h(_1, _2); }
// CHECK-LABEL: @xvssran_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssran_hu_w(_1, _2); }
// CHECK-LABEL: @xvssran_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssran_wu_d(_1, _2); }
// CHECK-LABEL: @xvsrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrarn_b_h(_1, _2); }
// CHECK-LABEL: @xvsrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrarn_h_w(_1, _2); }
// CHECK-LABEL: @xvsrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrarn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrarn_b_h(_1, _2); }
// CHECK-LABEL: @xvssrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrarn_h_w(_1, _2); }
// CHECK-LABEL: @xvssrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrarn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrarn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrarn_bu_h(_1, _2); }
// CHECK-LABEL: @xvssrarn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrarn_hu_w(_1, _2); }
// CHECK-LABEL: @xvssrarn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrarn_wu_d(_1, _2); }
// CHECK-LABEL: @xvsrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrln_b_h(_1, _2); }
// CHECK-LABEL: @xvsrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrln_h_w(_1, _2); }
// CHECK-LABEL: @xvsrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrln_w_d(_1, _2); }
// CHECK-LABEL: @xvssrln_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrln_bu_h(_1, _2); }
// CHECK-LABEL: @xvssrln_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrln_hu_w(_1, _2); }
// CHECK-LABEL: @xvssrln_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrln_wu_d(_1, _2); }
// CHECK-LABEL: @xvsrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrn_b_h(_1, _2); }
// CHECK-LABEL: @xvsrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrn_h_w(_1, _2); }
// CHECK-LABEL: @xvsrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrlrn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrlrn_bu_h(_1, _2); }
// CHECK-LABEL: @xvssrlrn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrlrn_hu_w(_1, _2); }
// CHECK-LABEL: @xvssrlrn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrlrn_wu_d(_1, _2); }
// CHECK-LABEL: @xvfrstpi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvfrstpi_b(v32i8 _1, v32i8 _2) { return __lasx_xvfrstpi_b(_1, _2, 1); }
// CHECK-LABEL: @xvfrstpi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __lasx_xvfrstpi_h(_1, _2, 1); }
// CHECK-LABEL: @xvfrstp_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvfrstp_b(_1, _2, _3); }
// CHECK-LABEL: @xvfrstp_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvfrstp_h(_1, _2, _3); }
// CHECK-LABEL: @xvshuf4i_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __lasx_xvshuf4i_d(_1, _2, 1); }
// CHECK-LABEL: @xvbsrl_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvbsrl_v(v32i8 _1) { return __lasx_xvbsrl_v(_1, 1); }
// CHECK-LABEL: @xvbsll_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvbsll_v(v32i8 _1) { return __lasx_xvbsll_v(_1, 1); }
// CHECK-LABEL: @xvextrins_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __lasx_xvextrins_b(_1, _2, 1); }
// CHECK-LABEL: @xvextrins_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvextrins_h(v16i16 _1, v16i16 _2) { return __lasx_xvextrins_h(_1, _2, 1); }
// CHECK-LABEL: @xvextrins_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __lasx_xvextrins_w(_1, _2, 1); }
// CHECK-LABEL: @xvextrins_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __lasx_xvextrins_d(_1, _2, 1); }
// CHECK-LABEL: @xvmskltz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmskltz_b(v32i8 _1) { return __lasx_xvmskltz_b(_1); }
// CHECK-LABEL: @xvmskltz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmskltz_h(v16i16 _1) { return __lasx_xvmskltz_h(_1); }
// CHECK-LABEL: @xvmskltz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmskltz_w(v8i32 _1) { return __lasx_xvmskltz_w(_1); }
// CHECK-LABEL: @xvmskltz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmskltz_d(v4i64 _1) { return __lasx_xvmskltz_d(_1); }
// CHECK-LABEL: @xvsigncov_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __lasx_xvsigncov_b(_1, _2); }
// CHECK-LABEL: @xvsigncov_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __lasx_xvsigncov_h(_1, _2); }
// CHECK-LABEL: @xvsigncov_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __lasx_xvsigncov_w(_1, _2); }
// CHECK-LABEL: @xvsigncov_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __lasx_xvsigncov_d(_1, _2); }
// CHECK-LABEL: @xvfmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmadd_s(_1, _2, _3); }
// CHECK-LABEL: @xvfmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmadd_d(_1, _2, _3); }
// CHECK-LABEL: @xvfmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmsub_s(_1, _2, _3); }
// CHECK-LABEL: @xvfmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmsub_d(_1, _2, _3); }
// CHECK-LABEL: @xvfnmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmadd_s(_1, _2, _3); }
// CHECK-LABEL: @xvfnmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmadd_d(_1, _2, _3); }
// CHECK-LABEL: @xvfnmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmsub_s(_1, _2, _3); }
// CHECK-LABEL: @xvfnmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmsub_d(_1, _2, _3); }
// CHECK-LABEL: @xvftintrne_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrne_w_s(v8f32 _1) { return __lasx_xvftintrne_w_s(_1); }
// CHECK-LABEL: @xvftintrne_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrne_l_d(v4f64 _1) { return __lasx_xvftintrne_l_d(_1); }
// CHECK-LABEL: @xvftintrp_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrp_w_s(v8f32 _1) { return __lasx_xvftintrp_w_s(_1); }
// CHECK-LABEL: @xvftintrp_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrp_l_d(v4f64 _1) { return __lasx_xvftintrp_l_d(_1); }
// CHECK-LABEL: @xvftintrm_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrm_w_s(v8f32 _1) { return __lasx_xvftintrm_w_s(_1); }
// CHECK-LABEL: @xvftintrm_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrm_l_d(v4f64 _1) { return __lasx_xvftintrm_l_d(_1); }
// CHECK-LABEL: @xvftint_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftint_w_d(_1, _2); }
// CHECK-LABEL: @xvffint_s_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __lasx_xvffint_s_l(_1, _2); }
// CHECK-LABEL: @xvftintrz_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrz_w_d(_1, _2); }
// CHECK-LABEL: @xvftintrp_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrp_w_d(_1, _2); }
// CHECK-LABEL: @xvftintrm_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrm_w_d(_1, _2); }
// CHECK-LABEL: @xvftintrne_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrne_w_d(_1, _2); }
// CHECK-LABEL: @xvftinth_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftinth_l_s(v8f32 _1) { return __lasx_xvftinth_l_s(_1); }
// CHECK-LABEL: @xvftintl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintl_l_s(v8f32 _1) { return __lasx_xvftintl_l_s(_1); }
// CHECK-LABEL: @xvffinth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffinth_d_w(v8i32 _1) { return __lasx_xvffinth_d_w(_1); }
// CHECK-LABEL: @xvffintl_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffintl_d_w(v8i32 _1) { return __lasx_xvffintl_d_w(_1); }
// CHECK-LABEL: @xvftintrzh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrzh_l_s(v8f32 _1) { return __lasx_xvftintrzh_l_s(_1); }
// CHECK-LABEL: @xvftintrzl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrzl_l_s(v8f32 _1) { return __lasx_xvftintrzl_l_s(_1); }
// CHECK-LABEL: @xvftintrph_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrph_l_s(v8f32 _1) { return __lasx_xvftintrph_l_s(_1); }
// CHECK-LABEL: @xvftintrpl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrpl_l_s(v8f32 _1) { return __lasx_xvftintrpl_l_s(_1); }
// CHECK-LABEL: @xvftintrmh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrmh_l_s(v8f32 _1) { return __lasx_xvftintrmh_l_s(_1); }
// CHECK-LABEL: @xvftintrml_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrml_l_s(v8f32 _1) { return __lasx_xvftintrml_l_s(_1); }
// CHECK-LABEL: @xvftintrneh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrneh_l_s(v8f32 _1) { return __lasx_xvftintrneh_l_s(_1); }
// CHECK-LABEL: @xvftintrnel_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrnel_l_s(v8f32 _1) { return __lasx_xvftintrnel_l_s(_1); }
// CHECK-LABEL: @xvfrintrne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrne_s(v8f32 _1) { return __lasx_xvfrintrne_s(_1); }
// CHECK-LABEL: @xvfrintrne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrne_d(v4f64 _1) { return __lasx_xvfrintrne_d(_1); }
// CHECK-LABEL: @xvfrintrz_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrz_s(v8f32 _1) { return __lasx_xvfrintrz_s(_1); }
// CHECK-LABEL: @xvfrintrz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrz_d(v4f64 _1) { return __lasx_xvfrintrz_d(_1); }
// CHECK-LABEL: @xvfrintrp_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrp_s(v8f32 _1) { return __lasx_xvfrintrp_s(_1); }
// CHECK-LABEL: @xvfrintrp_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrp_d(v4f64 _1) { return __lasx_xvfrintrp_d(_1); }
// CHECK-LABEL: @xvfrintrm_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrm_s(v8f32 _1) { return __lasx_xvfrintrm_s(_1); }
// CHECK-LABEL: @xvfrintrm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrm_d(v4f64 _1) { return __lasx_xvfrintrm_d(_1); }
// CHECK-LABEL: @xvld(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvld(void * _1) { return __lasx_xvld(_1, 1); }
// CHECK-LABEL: @xvst(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1)
// CHECK-NEXT: ret void
//
void xvst(v32i8 _1, void * _2) { return __lasx_xvst(_1, _2, 1); }
// CHECK-LABEL: @xvstelm_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_b(v32i8 _1, void * _2) { return __lasx_xvstelm_b(_1, _2, 1, 1); }
// CHECK-LABEL: @xvstelm_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2:%.*]], i32 2, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_h(v16i16 _1, void * _2) { return __lasx_xvstelm_h(_1, _2, 2, 1); }
// CHECK-LABEL: @xvstelm_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2:%.*]], i32 4, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_w(v8i32 _1, void * _2) { return __lasx_xvstelm_w(_1, _2, 4, 1); }
// CHECK-LABEL: @xvstelm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2:%.*]], i32 8, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_d(v4i64 _1, void * _2) { return __lasx_xvstelm_d(_1, _2, 8, 1); }
// CHECK-LABEL: @xvinsve0_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __lasx_xvinsve0_w(_1, _2, 1); }
// CHECK-LABEL: @xvinsve0_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __lasx_xvinsve0_d(_1, _2, 1); }
// CHECK-LABEL: @xvpickve_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpickve_w(v8i32 _1) { return __lasx_xvpickve_w(_1, 1); }
// CHECK-LABEL: @xvpickve_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpickve_d(v4i64 _1) { return __lasx_xvpickve_d(_1, 1); }
// CHECK-LABEL: @xvssrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrn_b_h(_1, _2); }
// CHECK-LABEL: @xvssrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrn_h_w(_1, _2); }
// CHECK-LABEL: @xvssrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrln_b_h(_1, _2); }
// CHECK-LABEL: @xvssrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrln_h_w(_1, _2); }
// CHECK-LABEL: @xvssrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrln_w_d(_1, _2); }
// CHECK-LABEL: @xvorn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __lasx_xvorn_v(_1, _2); }
// CHECK-LABEL: @xvldi(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvldi() { return __lasx_xvldi(1); }
// CHECK-LABEL: @xvldx(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1), !noalias [[META5:![0-9]+]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvldx(void * _1) { return __lasx_xvldx(_1, 1); }
// CHECK-LABEL: @xvstx(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1)
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_112]], ptr [[_2:%.*]], i64 1)
// CHECK-NEXT: ret void
//
void xvstx(v32i8 _1, void * _2) { return __lasx_xvstx(_1, _2, 1); }
// CHECK-LABEL: @xvextl_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvextl_qu_du(v4u64 _1) { return __lasx_xvextl_qu_du(_1); }
// CHECK-LABEL: @xvinsgr2vr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvinsgr2vr_w(v8i32 _1) { return __lasx_xvinsgr2vr_w(_1, 1, 1); }
// CHECK-LABEL: @xvinsgr2vr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1:%.*]], i64 1, i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvinsgr2vr_d(v4i64 _1) { return __lasx_xvinsgr2vr_d(_1, 1, 1); }
// CHECK-LABEL: @xvreplve0_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplve0_b(v32i8 _1) { return __lasx_xvreplve0_b(_1); }
// CHECK-LABEL: @xvreplve0_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvreplve0_h(v16i16 _1) { return __lasx_xvreplve0_h(_1); }
// CHECK-LABEL: @xvreplve0_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvreplve0_w(v8i32 _1) { return __lasx_xvreplve0_w(_1); }
// CHECK-LABEL: @xvreplve0_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvreplve0_d(v4i64 _1) { return __lasx_xvreplve0_d(_1); }
// CHECK-LABEL: @xvreplve0_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplve0_q(v32i8 _1) { return __lasx_xvreplve0_q(_1); }
// CHECK-LABEL: @vext2xv_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 vext2xv_h_b(v32i8 _1) { return __lasx_vext2xv_h_b(_1); }
// CHECK-LABEL: @vext2xv_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_w_h(v16i16 _1) { return __lasx_vext2xv_w_h(_1); }
// CHECK-LABEL: @vext2xv_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_d_w(v8i32 _1) { return __lasx_vext2xv_d_w(_1); }
// CHECK-LABEL: @vext2xv_w_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_w_b(v32i8 _1) { return __lasx_vext2xv_w_b(_1); }
// CHECK-LABEL: @vext2xv_d_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_d_h(v16i16 _1) { return __lasx_vext2xv_d_h(_1); }
// CHECK-LABEL: @vext2xv_d_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_d_b(v32i8 _1) { return __lasx_vext2xv_d_b(_1); }
// CHECK-LABEL: @vext2xv_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 vext2xv_hu_bu(v32i8 _1) { return __lasx_vext2xv_hu_bu(_1); }
// CHECK-LABEL: @vext2xv_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_wu_hu(v16i16 _1) { return __lasx_vext2xv_wu_hu(_1); }
// CHECK-LABEL: @vext2xv_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_du_wu(v8i32 _1) { return __lasx_vext2xv_du_wu(_1); }
// CHECK-LABEL: @vext2xv_wu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_wu_bu(v32i8 _1) { return __lasx_vext2xv_wu_bu(_1); }
// CHECK-LABEL: @vext2xv_du_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_du_hu(v16i16 _1) { return __lasx_vext2xv_du_hu(_1); }
// CHECK-LABEL: @vext2xv_du_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_du_bu(v32i8 _1) { return __lasx_vext2xv_du_bu(_1); }
// CHECK-LABEL: @xvpermi_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __lasx_xvpermi_q(_1, _2, 1); }
// CHECK-LABEL: @xvpermi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpermi_d(v4i64 _1) { return __lasx_xvpermi_d(_1, 1); }
// CHECK-LABEL: @xvperm_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __lasx_xvperm_w(_1, _2); }
// CHECK-LABEL: @xvldrepl_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvldrepl_b(void * _1) { return __lasx_xvldrepl_b(_1, 1); }
// CHECK-LABEL: @xvldrepl_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1:%.*]], i32 2)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvldrepl_h(void * _1) { return __lasx_xvldrepl_h(_1, 2); }
// CHECK-LABEL: @xvldrepl_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1:%.*]], i32 4)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvldrepl_w(void * _1) { return __lasx_xvldrepl_w(_1, 4); }
// CHECK-LABEL: @xvldrepl_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1:%.*]], i32 8)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvldrepl_d(void * _1) { return __lasx_xvldrepl_d(_1, 8); }
// CHECK-LABEL: @xvpickve2gr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xvpickve2gr_w(v8i32 _1) { return __lasx_xvpickve2gr_w(_1, 1); }
// CHECK-LABEL: @xvpickve2gr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int xvpickve2gr_wu(v8i32 _1) { return __lasx_xvpickve2gr_wu(_1, 1); }
// CHECK-LABEL: @xvpickve2gr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
long xvpickve2gr_d(v4i64 _1) { return __lasx_xvpickve2gr_d(_1, 1); }
// CHECK-LABEL: @xvpickve2gr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
unsigned long int xvpickve2gr_du(v4i64 _1) { return __lasx_xvpickve2gr_du(_1, 1); }
// CHECK-LABEL: @xvaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwev_q_d(_1, _2); }
// CHECK-LABEL: @xvaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwev_d_w(_1, _2); }
// CHECK-LABEL: @xvaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwev_w_h(_1, _2); }
// CHECK-LABEL: @xvaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwev_h_b(_1, _2); }
// CHECK-LABEL: @xvaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwev_q_du(_1, _2); }
// CHECK-LABEL: @xvaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwev_d_wu(_1, _2); }
// CHECK-LABEL: @xvaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwev_w_hu(_1, _2); }
// CHECK-LABEL: @xvaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwev_h_bu(_1, _2); }
// CHECK-LABEL: @xvsubwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwev_q_d(_1, _2); }
// CHECK-LABEL: @xvsubwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwev_d_w(_1, _2); }
// CHECK-LABEL: @xvsubwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwev_w_h(_1, _2); }
// CHECK-LABEL: @xvsubwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwev_h_b(_1, _2); }
// CHECK-LABEL: @xvsubwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwev_q_du(_1, _2); }
// CHECK-LABEL: @xvsubwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwev_d_wu(_1, _2); }
// CHECK-LABEL: @xvsubwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwev_w_hu(_1, _2); }
// CHECK-LABEL: @xvsubwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwev_h_bu(_1, _2); }
// CHECK-LABEL: @xvmulwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwev_q_d(_1, _2); }
// CHECK-LABEL: @xvmulwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwev_d_w(_1, _2); }
// CHECK-LABEL: @xvmulwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwev_w_h(_1, _2); }
// CHECK-LABEL: @xvmulwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwev_h_b(_1, _2); }
// CHECK-LABEL: @xvmulwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwev_q_du(_1, _2); }
// CHECK-LABEL: @xvmulwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwev_d_wu(_1, _2); }
// CHECK-LABEL: @xvmulwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwev_w_hu(_1, _2); }
// CHECK-LABEL: @xvmulwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwev_h_bu(_1, _2); }
// CHECK-LABEL: @xvaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwod_q_d(_1, _2); }
// CHECK-LABEL: @xvaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwod_d_w(_1, _2); }
// CHECK-LABEL: @xvaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwod_w_h(_1, _2); }
// CHECK-LABEL: @xvaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwod_h_b(_1, _2); }
// CHECK-LABEL: @xvaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwod_q_du(_1, _2); }
// CHECK-LABEL: @xvaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwod_d_wu(_1, _2); }
// CHECK-LABEL: @xvaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwod_w_hu(_1, _2); }
// CHECK-LABEL: @xvaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwod_h_bu(_1, _2); }
// CHECK-LABEL: @xvsubwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwod_q_d(_1, _2); }
// CHECK-LABEL: @xvsubwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwod_d_w(_1, _2); }
// CHECK-LABEL: @xvsubwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwod_w_h(_1, _2); }
// CHECK-LABEL: @xvsubwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwod_h_b(_1, _2); }
// CHECK-LABEL: @xvsubwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwod_q_du(_1, _2); }
// CHECK-LABEL: @xvsubwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwod_d_wu(_1, _2); }
// CHECK-LABEL: @xvsubwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwod_w_hu(_1, _2); }
// CHECK-LABEL: @xvsubwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwod_h_bu(_1, _2); }
// CHECK-LABEL: @xvmulwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwod_q_d(_1, _2); }
// CHECK-LABEL: @xvmulwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwod_d_w(_1, _2); }
// CHECK-LABEL: @xvmulwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwod_w_h(_1, _2); }
// CHECK-LABEL: @xvmulwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwod_h_b(_1, _2); }
// CHECK-LABEL: @xvmulwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwod_q_du(_1, _2); }
// CHECK-LABEL: @xvmulwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwod_d_wu(_1, _2); }
// CHECK-LABEL: @xvmulwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwod_w_hu(_1, _2); }
// CHECK-LABEL: @xvmulwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwod_h_bu(_1, _2); }
// CHECK-LABEL: @xvaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwev_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwev_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwev_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvmulwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwev_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvmulwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwev_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvmulwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwev_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwod_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwod_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwod_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvmulwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwod_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvmulwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwod_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvmulwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwod_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvhaddw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvhaddw_q_d(_1, _2); }
// CHECK-LABEL: @xvhaddw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __lasx_xvhaddw_qu_du(_1, _2); }
// CHECK-LABEL: @xvhsubw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvhsubw_q_d(_1, _2); }
// CHECK-LABEL: @xvhsubw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return __lasx_xvhsubw_qu_du(_1, _2); }
// CHECK-LABEL: @xvmaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_b(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwev_q_du(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwev_d_wu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmaddwev_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwev_w_hu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwev_h_bu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_b(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwod_q_du(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwod_d_wu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwod_w_hu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwod_h_bu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_du_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_wu_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_hu_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_bu_b(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_du_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_wu_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwod_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_hu_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_bu_b(_1, _2, _3); }
// CHECK-LABEL: @xvrotr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __lasx_xvrotr_b(_1, _2); }
// CHECK-LABEL: @xvrotr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __lasx_xvrotr_h(_1, _2); }
// CHECK-LABEL: @xvrotr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return __lasx_xvrotr_w(_1, _2); }
// CHECK-LABEL: @xvrotr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { return __lasx_xvrotr_d(_1, _2); }
// CHECK-LABEL: @xvadd_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __lasx_xvadd_q(_1, _2); }
// CHECK-LABEL: @xvsub_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __lasx_xvsub_q(_1, _2); }
// CHECK-LABEL: @xvaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwev_q_du_d(_1, _2); }
// CHECK-LABEL: @xvaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwod_q_du_d(_1, _2); }
// CHECK-LABEL: @xvmulwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvmulwev_q_du_d(_1, _2); }
// CHECK-LABEL: @xvmulwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvmulwod_q_du_d(_1, _2); }
// CHECK-LABEL: @xvmskgez_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmskgez_b(v32i8 _1) { return __lasx_xvmskgez_b(_1); }
// CHECK-LABEL: @xvmsknz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmsknz_b(v32i8 _1) { return __lasx_xvmsknz_b(_1); }
// CHECK-LABEL: @xvexth_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvexth_h_b(v32i8 _1) { return __lasx_xvexth_h_b(_1); }
// CHECK-LABEL: @xvexth_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvexth_w_h(v16i16 _1) { return __lasx_xvexth_w_h(_1); }
// CHECK-LABEL: @xvexth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvexth_d_w(v8i32 _1) { return __lasx_xvexth_d_w(_1); }
// CHECK-LABEL: @xvexth_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvexth_q_d(v4i64 _1) { return __lasx_xvexth_q_d(_1); }
// CHECK-LABEL: @xvexth_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_112]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvexth_hu_bu(v32u8 _1) { return __lasx_xvexth_hu_bu(_1); }
// CHECK-LABEL: @xvexth_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_112]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvexth_wu_hu(v16u16 _1) { return __lasx_xvexth_wu_hu(_1); }
// CHECK-LABEL: @xvexth_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_112]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvexth_du_wu(v8u32 _1) { return __lasx_xvexth_du_wu(_1); }
// CHECK-LABEL: @xvexth_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvexth_qu_du(v4u64 _1) { return __lasx_xvexth_qu_du(_1); }
// CHECK-LABEL: @xvrotri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrotri_b(v32i8 _1) { return __lasx_xvrotri_b(_1, 1); }
// CHECK-LABEL: @xvrotri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrotri_h(v16i16 _1) { return __lasx_xvrotri_h(_1, 1); }
// CHECK-LABEL: @xvrotri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrotri_w(v8i32 _1) { return __lasx_xvrotri_w(_1, 1); }
// CHECK-LABEL: @xvrotri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrotri_d(v4i64 _1) { return __lasx_xvrotri_d(_1, 1); }
// CHECK-LABEL: @xvextl_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvextl_q_d(v4i64 _1) { return __lasx_xvextl_q_d(_1); }
// CHECK-LABEL: @xvsrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlrni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlni_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlni_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlni_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlni_du_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlrni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlrni_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlrni_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlrni_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlrni_du_q(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrani_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrani_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrani_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrani_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrarni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrarni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrani_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrani_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrani_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrani_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrani_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrani_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrani_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrani_du_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrarni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrarni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrarni_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrarni_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrarni_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrarni_du_q(_1, _2, 1); }
// CHECK-LABEL: @xbnz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_b(v32u8 _1) { return __lasx_xbnz_b(_1); }
// CHECK-LABEL: @xbnz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_d(v4u64 _1) { return __lasx_xbnz_d(_1); }
// CHECK-LABEL: @xbnz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_h(v16u16 _1) { return __lasx_xbnz_h(_1); }
// CHECK-LABEL: @xbnz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_v(v32u8 _1) { return __lasx_xbnz_v(_1); }
// CHECK-LABEL: @xbnz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_w(v8u32 _1) { return __lasx_xbnz_w(_1); }
// CHECK-LABEL: @xbz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_b(v32u8 _1) { return __lasx_xbz_b(_1); }
// CHECK-LABEL: @xbz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_d(v4u64 _1) { return __lasx_xbz_d(_1); }
// CHECK-LABEL: @xbz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_h(v16u16 _1) { return __lasx_xbz_h(_1); }
// CHECK-LABEL: @xbz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_v(v32u8 _1) { return __lasx_xbz_v(_1); }
// CHECK-LABEL: @xbz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_w(v8u32 _1) { return __lasx_xbz_w(_1); }
// CHECK-LABEL: @xvfcmp_caf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_caf_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_caf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_caf_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_ceq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_ceq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_ceq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_ceq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cle_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cle_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_clt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_clt_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_clt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_clt_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cne_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cne_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cor_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cor_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cueq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cueq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cule_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cule_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cult_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cult_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cun_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cune_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cune_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cun_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_saf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_saf_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_saf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_saf_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_seq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_seq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_seq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_seq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sle_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sle_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_slt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_slt_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_slt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_slt_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sne_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sne_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sor_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sor_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sueq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sueq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sule_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sule_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sult_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sult_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sun_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sune_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sune_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sun_s(_1, _2); }
// CHECK-LABEL: @xvpickve_d_f(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvpickve_d_f(v4f64 _1) { return __lasx_xvpickve_d_f(_1, 1); }
// CHECK-LABEL: @xvpickve_w_f(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvpickve_w_f(v8f32 _1) { return __lasx_xvpickve_w_f(_1, 1); }
// CHECK-LABEL: @xvrepli_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrepli_b() { return __lasx_xvrepli_b(1); }
// CHECK-LABEL: @xvrepli_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrepli_d() { return __lasx_xvrepli_d(1); }
// CHECK-LABEL: @xvrepli_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); }
// CHECK-LABEL: @xvrepli_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); }
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin.c b/clang/test/CodeGen/LoongArch/lasx/builtin.c
index 0185f2004d52..f52a23a5faea 100644
--- a/clang/test/CodeGen/LoongArch/lasx/builtin.c
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin.c
@@ -27,4426 +27,6382 @@ typedef double v4f64_d __attribute__((vector_size(32), aligned(8)));
// CHECK-LABEL: @xvsll_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsll_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsll_b(_1, _2); }
// CHECK-LABEL: @xvsll_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsll_h(_1, _2); }
// CHECK-LABEL: @xvsll_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsll_w(_1, _2); }
// CHECK-LABEL: @xvsll_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsll_d(_1, _2); }
// CHECK-LABEL: @xvslli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslli_b(v32i8 _1) { return __builtin_lasx_xvslli_b(_1, 1); }
// CHECK-LABEL: @xvslli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslli_h(v16i16 _1) { return __builtin_lasx_xvslli_h(_1, 1); }
// CHECK-LABEL: @xvslli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslli_w(v8i32 _1) { return __builtin_lasx_xvslli_w(_1, 1); }
// CHECK-LABEL: @xvslli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslli_d(v4i64 _1) { return __builtin_lasx_xvslli_d(_1, 1); }
// CHECK-LABEL: @xvsra_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsra_b(_1, _2); }
// CHECK-LABEL: @xvsra_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsra_h(_1, _2); }
// CHECK-LABEL: @xvsra_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsra_w(_1, _2); }
// CHECK-LABEL: @xvsra_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsra_d(_1, _2); }
// CHECK-LABEL: @xvsrai_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrai_b(v32i8 _1) { return __builtin_lasx_xvsrai_b(_1, 1); }
// CHECK-LABEL: @xvsrai_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrai_h(v16i16 _1) { return __builtin_lasx_xvsrai_h(_1, 1); }
// CHECK-LABEL: @xvsrai_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrai_w(v8i32 _1) { return __builtin_lasx_xvsrai_w(_1, 1); }
// CHECK-LABEL: @xvsrai_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrai_d(v4i64 _1) { return __builtin_lasx_xvsrai_d(_1, 1); }
// CHECK-LABEL: @xvsrar_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrar_b(_1, _2); }
// CHECK-LABEL: @xvsrar_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrar_h(_1, _2); }
// CHECK-LABEL: @xvsrar_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrar_w(_1, _2); }
// CHECK-LABEL: @xvsrar_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrar_d(_1, _2); }
// CHECK-LABEL: @xvsrari_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrari_b(v32i8 _1) { return __builtin_lasx_xvsrari_b(_1, 1); }
// CHECK-LABEL: @xvsrari_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrari_h(v16i16 _1) { return __builtin_lasx_xvsrari_h(_1, 1); }
// CHECK-LABEL: @xvsrari_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrari_w(v8i32 _1) { return __builtin_lasx_xvsrari_w(_1, 1); }
// CHECK-LABEL: @xvsrari_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrari_d(v4i64 _1) { return __builtin_lasx_xvsrari_d(_1, 1); }
// CHECK-LABEL: @xvsrl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrl_b(_1, _2); }
// CHECK-LABEL: @xvsrl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrl_h(_1, _2); }
// CHECK-LABEL: @xvsrl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrl_w(_1, _2); }
// CHECK-LABEL: @xvsrl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrl_d(_1, _2); }
// CHECK-LABEL: @xvsrli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrli_b(v32i8 _1) { return __builtin_lasx_xvsrli_b(_1, 1); }
// CHECK-LABEL: @xvsrli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrli_h(v16i16 _1) { return __builtin_lasx_xvsrli_h(_1, 1); }
// CHECK-LABEL: @xvsrli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrli_w(v8i32 _1) { return __builtin_lasx_xvsrli_w(_1, 1); }
// CHECK-LABEL: @xvsrli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrli_d(v4i64 _1) { return __builtin_lasx_xvsrli_d(_1, 1); }
// CHECK-LABEL: @xvsrlr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlr_b(_1, _2); }
// CHECK-LABEL: @xvsrlr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlr_h(_1, _2); }
// CHECK-LABEL: @xvsrlr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlr_w(_1, _2); }
// CHECK-LABEL: @xvsrlr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlr_d(_1, _2); }
// CHECK-LABEL: @xvsrlri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlri_b(v32i8 _1) { return __builtin_lasx_xvsrlri_b(_1, 1); }
// CHECK-LABEL: @xvsrlri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlri_h(v16i16 _1) { return __builtin_lasx_xvsrlri_h(_1, 1); }
// CHECK-LABEL: @xvsrlri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlri_w(v8i32 _1) { return __builtin_lasx_xvsrlri_w(_1, 1); }
// CHECK-LABEL: @xvsrlri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlri_d(v4i64 _1) { return __builtin_lasx_xvsrlri_d(_1, 1); }
// CHECK-LABEL: @xvbitclr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitclr_b(_1, _2); }
// CHECK-LABEL: @xvbitclr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitclr_h(_1, _2); }
// CHECK-LABEL: @xvbitclr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitclr_w(_1, _2); }
// CHECK-LABEL: @xvbitclr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitclr_d(_1, _2); }
// CHECK-LABEL: @xvbitclri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitclri_b(v32u8 _1) { return __builtin_lasx_xvbitclri_b(_1, 1); }
// CHECK-LABEL: @xvbitclri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitclri_h(v16u16 _1) { return __builtin_lasx_xvbitclri_h(_1, 1); }
// CHECK-LABEL: @xvbitclri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitclri_w(v8u32 _1) { return __builtin_lasx_xvbitclri_w(_1, 1); }
// CHECK-LABEL: @xvbitclri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitclri_d(v4u64 _1) { return __builtin_lasx_xvbitclri_d(_1, 1); }
// CHECK-LABEL: @xvbitset_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitset_b(_1, _2); }
// CHECK-LABEL: @xvbitset_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitset_h(_1, _2); }
// CHECK-LABEL: @xvbitset_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitset_w(_1, _2); }
// CHECK-LABEL: @xvbitset_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitset_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitset_d(_1, _2); }
// CHECK-LABEL: @xvbitseti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitseti_b(v32u8 _1) { return __builtin_lasx_xvbitseti_b(_1, 1); }
// CHECK-LABEL: @xvbitseti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitseti_h(v16u16 _1) { return __builtin_lasx_xvbitseti_h(_1, 1); }
// CHECK-LABEL: @xvbitseti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitseti_w(v8u32 _1) { return __builtin_lasx_xvbitseti_w(_1, 1); }
// CHECK-LABEL: @xvbitseti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitseti_d(v4u64 _1) { return __builtin_lasx_xvbitseti_d(_1, 1); }
// CHECK-LABEL: @xvbitrev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitrev_b(_1, _2); }
// CHECK-LABEL: @xvbitrev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitrev_h(_1, _2); }
// CHECK-LABEL: @xvbitrev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitrev_w(_1, _2); }
// CHECK-LABEL: @xvbitrev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitrev_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitrev_d(_1, _2); }
// CHECK-LABEL: @xvbitrevi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitrevi_b(v32u8 _1) { return __builtin_lasx_xvbitrevi_b(_1, 1); }
// CHECK-LABEL: @xvbitrevi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvbitrevi_h(v16u16 _1) { return __builtin_lasx_xvbitrevi_h(_1, 1); }
// CHECK-LABEL: @xvbitrevi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvbitrevi_w(v8u32 _1) { return __builtin_lasx_xvbitrevi_w(_1, 1); }
// CHECK-LABEL: @xvbitrevi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvbitrevi_d(v4u64 _1) { return __builtin_lasx_xvbitrevi_d(_1, 1); }
// CHECK-LABEL: @xvadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadd_b(_1, _2); }
// CHECK-LABEL: @xvadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadd_h(_1, _2); }
// CHECK-LABEL: @xvadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadd_w(_1, _2); }
// CHECK-LABEL: @xvadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_d(_1, _2); }
// CHECK-LABEL: @xvaddi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvaddi_bu(v32i8 _1) { return __builtin_lasx_xvaddi_bu(_1, 1); }
// CHECK-LABEL: @xvaddi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddi_hu(v16i16 _1) { return __builtin_lasx_xvaddi_hu(_1, 1); }
// CHECK-LABEL: @xvaddi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddi_wu(v8i32 _1) { return __builtin_lasx_xvaddi_wu(_1, 1); }
// CHECK-LABEL: @xvaddi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddi_du(v4i64 _1) { return __builtin_lasx_xvaddi_du(_1, 1); }
// CHECK-LABEL: @xvsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsub_b(_1, _2); }
// CHECK-LABEL: @xvsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsub_h(_1, _2); }
// CHECK-LABEL: @xvsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsub_w(_1, _2); }
// CHECK-LABEL: @xvsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_d(_1, _2); }
// CHECK-LABEL: @xvsubi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsubi_bu(v32i8 _1) { return __builtin_lasx_xvsubi_bu(_1, 1); }
// CHECK-LABEL: @xvsubi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubi_hu(v16i16 _1) { return __builtin_lasx_xvsubi_hu(_1, 1); }
// CHECK-LABEL: @xvsubi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubi_wu(v8i32 _1) { return __builtin_lasx_xvsubi_wu(_1, 1); }
// CHECK-LABEL: @xvsubi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubi_du(v4i64 _1) { return __builtin_lasx_xvsubi_du(_1, 1); }
// CHECK-LABEL: @xvmax_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmax_b(_1, _2); }
// CHECK-LABEL: @xvmax_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmax_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmax_h(_1, _2); }
// CHECK-LABEL: @xvmax_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmax_w(_1, _2); }
// CHECK-LABEL: @xvmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmax_d(_1, _2); }
// CHECK-LABEL: @xvmaxi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmaxi_b(v32i8 _1) { return __builtin_lasx_xvmaxi_b(_1, 1); }
// CHECK-LABEL: @xvmaxi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaxi_h(v16i16 _1) { return __builtin_lasx_xvmaxi_h(_1, 1); }
// CHECK-LABEL: @xvmaxi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaxi_w(v8i32 _1) { return __builtin_lasx_xvmaxi_w(_1, 1); }
// CHECK-LABEL: @xvmaxi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaxi_d(v4i64 _1) { return __builtin_lasx_xvmaxi_d(_1, 1); }
// CHECK-LABEL: @xvmax_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmax_bu(_1, _2); }
// CHECK-LABEL: @xvmax_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmax_hu(_1, _2); }
// CHECK-LABEL: @xvmax_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmax_wu(_1, _2); }
// CHECK-LABEL: @xvmax_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmax_du(_1, _2); }
// CHECK-LABEL: @xvmaxi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmaxi_bu(v32u8 _1) { return __builtin_lasx_xvmaxi_bu(_1, 1); }
// CHECK-LABEL: @xvmaxi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmaxi_hu(v16u16 _1) { return __builtin_lasx_xvmaxi_hu(_1, 1); }
// CHECK-LABEL: @xvmaxi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmaxi_wu(v8u32 _1) { return __builtin_lasx_xvmaxi_wu(_1, 1); }
// CHECK-LABEL: @xvmaxi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaxi_du(v4u64 _1) { return __builtin_lasx_xvmaxi_du(_1, 1); }
// CHECK-LABEL: @xvmin_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmin_b(_1, _2); }
// CHECK-LABEL: @xvmin_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmin_h(_1, _2); }
// CHECK-LABEL: @xvmin_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmin_w(_1, _2); }
// CHECK-LABEL: @xvmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmin_d(_1, _2); }
// CHECK-LABEL: @xvmini_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmini_b(v32i8 _1) { return __builtin_lasx_xvmini_b(_1, 1); }
// CHECK-LABEL: @xvmini_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmini_h(v16i16 _1) { return __builtin_lasx_xvmini_h(_1, 1); }
// CHECK-LABEL: @xvmini_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmini_w(v8i32 _1) { return __builtin_lasx_xvmini_w(_1, 1); }
// CHECK-LABEL: @xvmini_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmini_d(v4i64 _1) { return __builtin_lasx_xvmini_d(_1, 1); }
// CHECK-LABEL: @xvmin_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmin_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmin_bu(_1, _2); }
// CHECK-LABEL: @xvmin_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmin_hu(_1, _2); }
// CHECK-LABEL: @xvmin_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmin_wu(_1, _2); }
// CHECK-LABEL: @xvmin_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmin_du(_1, _2); }
// CHECK-LABEL: @xvmini_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmini_bu(v32u8 _1) { return __builtin_lasx_xvmini_bu(_1, 1); }
// CHECK-LABEL: @xvmini_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmini_hu(v16u16 _1) { return __builtin_lasx_xvmini_hu(_1, 1); }
// CHECK-LABEL: @xvmini_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmini_wu(v8u32 _1) { return __builtin_lasx_xvmini_wu(_1, 1); }
// CHECK-LABEL: @xvmini_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmini_du(v4u64 _1) { return __builtin_lasx_xvmini_du(_1, 1); }
// CHECK-LABEL: @xvseq_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvseq_b(_1, _2); }
// CHECK-LABEL: @xvseq_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvseq_h(_1, _2); }
// CHECK-LABEL: @xvseq_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvseq_w(_1, _2); }
// CHECK-LABEL: @xvseq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvseq_d(_1, _2); }
// CHECK-LABEL: @xvseqi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvseqi_b(v32i8 _1) { return __builtin_lasx_xvseqi_b(_1, 1); }
// CHECK-LABEL: @xvseqi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvseqi_h(v16i16 _1) { return __builtin_lasx_xvseqi_h(_1, 1); }
// CHECK-LABEL: @xvseqi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvseqi_w(v8i32 _1) { return __builtin_lasx_xvseqi_w(_1, 1); }
// CHECK-LABEL: @xvseqi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvseqi_d(v4i64 _1) { return __builtin_lasx_xvseqi_d(_1, 1); }
// CHECK-LABEL: @xvslt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslt_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvslt_b(_1, _2); }
// CHECK-LABEL: @xvslt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvslt_h(_1, _2); }
// CHECK-LABEL: @xvslt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvslt_w(_1, _2); }
// CHECK-LABEL: @xvslt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvslt_d(_1, _2); }
// CHECK-LABEL: @xvslti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslti_b(v32i8 _1) { return __builtin_lasx_xvslti_b(_1, 1); }
// CHECK-LABEL: @xvslti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslti_h(v16i16 _1) { return __builtin_lasx_xvslti_h(_1, 1); }
// CHECK-LABEL: @xvslti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslti_w(v8i32 _1) { return __builtin_lasx_xvslti_w(_1, 1); }
// CHECK-LABEL: @xvslti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslti_d(v4i64 _1) { return __builtin_lasx_xvslti_d(_1, 1); }
// CHECK-LABEL: @xvslt_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvslt_bu(_1, _2); }
// CHECK-LABEL: @xvslt_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvslt_hu(_1, _2); }
// CHECK-LABEL: @xvslt_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvslt_wu(_1, _2); }
// CHECK-LABEL: @xvslt_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvslt_du(_1, _2); }
// CHECK-LABEL: @xvslti_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslti_bu(v32u8 _1) { return __builtin_lasx_xvslti_bu(_1, 1); }
// CHECK-LABEL: @xvslti_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslti_hu(v16u16 _1) { return __builtin_lasx_xvslti_hu(_1, 1); }
// CHECK-LABEL: @xvslti_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslti_wu(v8u32 _1) { return __builtin_lasx_xvslti_wu(_1, 1); }
// CHECK-LABEL: @xvslti_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslti_du(v4u64 _1) { return __builtin_lasx_xvslti_du(_1, 1); }
// CHECK-LABEL: @xvsle_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsle_b(_1, _2); }
// CHECK-LABEL: @xvsle_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsle_h(_1, _2); }
// CHECK-LABEL: @xvsle_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsle_w(_1, _2); }
// CHECK-LABEL: @xvsle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsle_d(_1, _2); }
// CHECK-LABEL: @xvslei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslei_b(v32i8 _1) { return __builtin_lasx_xvslei_b(_1, 1); }
// CHECK-LABEL: @xvslei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslei_h(v16i16 _1) { return __builtin_lasx_xvslei_h(_1, 1); }
// CHECK-LABEL: @xvslei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslei_w(v8i32 _1) { return __builtin_lasx_xvslei_w(_1, 1); }
// CHECK-LABEL: @xvslei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslei_d(v4i64 _1) { return __builtin_lasx_xvslei_d(_1, 1); }
// CHECK-LABEL: @xvsle_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsle_bu(_1, _2); }
// CHECK-LABEL: @xvsle_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsle_hu(_1, _2); }
// CHECK-LABEL: @xvsle_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsle_wu(_1, _2); }
// CHECK-LABEL: @xvsle_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsle_du(_1, _2); }
// CHECK-LABEL: @xvslei_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvslei_bu(v32u8 _1) { return __builtin_lasx_xvslei_bu(_1, 1); }
// CHECK-LABEL: @xvslei_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvslei_hu(v16u16 _1) { return __builtin_lasx_xvslei_hu(_1, 1); }
// CHECK-LABEL: @xvslei_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvslei_wu(v8u32 _1) { return __builtin_lasx_xvslei_wu(_1, 1); }
// CHECK-LABEL: @xvslei_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvslei_du(v4u64 _1) { return __builtin_lasx_xvslei_du(_1, 1); }
// CHECK-LABEL: @xvsat_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsat_b(v32i8 _1) { return __builtin_lasx_xvsat_b(_1, 1); }
// CHECK-LABEL: @xvsat_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsat_h(v16i16 _1) { return __builtin_lasx_xvsat_h(_1, 1); }
// CHECK-LABEL: @xvsat_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsat_w(v8i32 _1) { return __builtin_lasx_xvsat_w(_1, 1); }
// CHECK-LABEL: @xvsat_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsat_d(v4i64 _1) { return __builtin_lasx_xvsat_d(_1, 1); }
// CHECK-LABEL: @xvsat_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvsat_bu(v32u8 _1) { return __builtin_lasx_xvsat_bu(_1, 1); }
// CHECK-LABEL: @xvsat_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvsat_hu(v16u16 _1) { return __builtin_lasx_xvsat_hu(_1, 1); }
// CHECK-LABEL: @xvsat_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvsat_wu(v8u32 _1) { return __builtin_lasx_xvsat_wu(_1, 1); }
// CHECK-LABEL: @xvsat_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvsat_du(v4u64 _1) { return __builtin_lasx_xvsat_du(_1, 1); }
// CHECK-LABEL: @xvadda_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadda_b(_1, _2); }
// CHECK-LABEL: @xvadda_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadda_h(_1, _2); }
// CHECK-LABEL: @xvadda_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadda_w(_1, _2); }
// CHECK-LABEL: @xvadda_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadda_d(_1, _2); }
// CHECK-LABEL: @xvsadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsadd_b(_1, _2); }
// CHECK-LABEL: @xvsadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsadd_h(_1, _2); }
// CHECK-LABEL: @xvsadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsadd_w(_1, _2); }
// CHECK-LABEL: @xvsadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsadd_d(_1, _2); }
// CHECK-LABEL: @xvsadd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsadd_bu(_1, _2); }
// CHECK-LABEL: @xvsadd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsadd_hu(_1, _2); }
// CHECK-LABEL: @xvsadd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsadd_wu(_1, _2); }
// CHECK-LABEL: @xvsadd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsadd_du(_1, _2); }
// CHECK-LABEL: @xvavg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavg_b(_1, _2); }
// CHECK-LABEL: @xvavg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavg_h(_1, _2); }
// CHECK-LABEL: @xvavg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvavg_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvavg_w(_1, _2); }
// CHECK-LABEL: @xvavg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavg_d(_1, _2); }
// CHECK-LABEL: @xvavg_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavg_bu(_1, _2); }
// CHECK-LABEL: @xvavg_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavg_hu(_1, _2); }
// CHECK-LABEL: @xvavg_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavg_wu(_1, _2); }
// CHECK-LABEL: @xvavg_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvavg_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavg_du(_1, _2); }
// CHECK-LABEL: @xvavgr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavgr_b(_1, _2); }
// CHECK-LABEL: @xvavgr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavgr_h(_1, _2); }
// CHECK-LABEL: @xvavgr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvavgr_w(_1, _2); }
// CHECK-LABEL: @xvavgr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavgr_d(_1, _2); }
// CHECK-LABEL: @xvavgr_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavgr_bu(_1, _2); }
// CHECK-LABEL: @xvavgr_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavgr_hu(_1, _2); }
// CHECK-LABEL: @xvavgr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavgr_wu(_1, _2); }
// CHECK-LABEL: @xvavgr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavgr_du(_1, _2); }
// CHECK-LABEL: @xvssub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssub_b(_1, _2); }
// CHECK-LABEL: @xvssub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssub_h(_1, _2); }
// CHECK-LABEL: @xvssub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssub_w(_1, _2); }
// CHECK-LABEL: @xvssub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssub_d(_1, _2); }
// CHECK-LABEL: @xvssub_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvssub_bu(_1, _2); }
// CHECK-LABEL: @xvssub_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssub_hu(_1, _2); }
// CHECK-LABEL: @xvssub_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssub_wu(_1, _2); }
// CHECK-LABEL: @xvssub_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssub_du(_1, _2); }
// CHECK-LABEL: @xvabsd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvabsd_b(_1, _2); }
// CHECK-LABEL: @xvabsd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvabsd_h(_1, _2); }
// CHECK-LABEL: @xvabsd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvabsd_w(_1, _2); }
// CHECK-LABEL: @xvabsd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvabsd_d(_1, _2); }
// CHECK-LABEL: @xvabsd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvabsd_bu(_1, _2); }
// CHECK-LABEL: @xvabsd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvabsd_hu(_1, _2); }
// CHECK-LABEL: @xvabsd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvabsd_wu(_1, _2); }
// CHECK-LABEL: @xvabsd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvabsd_du(_1, _2); }
// CHECK-LABEL: @xvmul_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmul_b(_1, _2); }
// CHECK-LABEL: @xvmul_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmul_h(_1, _2); }
// CHECK-LABEL: @xvmul_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmul_w(_1, _2); }
// CHECK-LABEL: @xvmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmul_d(_1, _2); }
// CHECK-LABEL: @xvmadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmadd_b(_1, _2, _3); }
// CHECK-LABEL: @xvmadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmadd_h(_1, _2, _3); }
// CHECK-LABEL: @xvmadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmadd_w(_1, _2, _3); }
// CHECK-LABEL: @xvmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmadd_d(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmsub_b(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmsub_h(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmsub_w(_1, _2, _3); }
// CHECK-LABEL: @xvmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmsub_d(_1, _2, _3); }
// CHECK-LABEL: @xvdiv_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvdiv_b(_1, _2); }
// CHECK-LABEL: @xvdiv_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvdiv_h(_1, _2); }
// CHECK-LABEL: @xvdiv_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvdiv_w(_1, _2); }
// CHECK-LABEL: @xvdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvdiv_d(_1, _2); }
// CHECK-LABEL: @xvdiv_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvdiv_bu(_1, _2); }
// CHECK-LABEL: @xvdiv_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvdiv_hu(_1, _2); }
// CHECK-LABEL: @xvdiv_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvdiv_wu(_1, _2); }
// CHECK-LABEL: @xvdiv_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvdiv_du(_1, _2); }
// CHECK-LABEL: @xvhaddw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvhaddw_h_b(_1, _2); }
// CHECK-LABEL: @xvhaddw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhaddw_w_h(_1, _2); }
// CHECK-LABEL: @xvhaddw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhaddw_d_w(_1, _2); }
// CHECK-LABEL: @xvhaddw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhaddw_hu_bu(_1, _2); }
// CHECK-LABEL: @xvhaddw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhaddw_wu_hu(_1, _2); }
// CHECK-LABEL: @xvhaddw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhaddw_du_wu(_1, _2); }
// CHECK-LABEL: @xvhsubw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvhsubw_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvhsubw_h_b(_1, _2); }
// CHECK-LABEL: @xvhsubw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhsubw_w_h(_1, _2); }
// CHECK-LABEL: @xvhsubw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhsubw_d_w(_1, _2); }
// CHECK-LABEL: @xvhsubw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhsubw_hu_bu(_1, _2); }
// CHECK-LABEL: @xvhsubw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhsubw_wu_hu(_1, _2); }
// CHECK-LABEL: @xvhsubw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhsubw_du_wu(_1, _2); }
// CHECK-LABEL: @xvmod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmod_b(_1, _2); }
// CHECK-LABEL: @xvmod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmod_h(_1, _2); }
// CHECK-LABEL: @xvmod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmod_w(_1, _2); }
// CHECK-LABEL: @xvmod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmod_d(_1, _2); }
// CHECK-LABEL: @xvmod_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmod_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmod_bu(_1, _2); }
// CHECK-LABEL: @xvmod_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmod_hu(_1, _2); }
// CHECK-LABEL: @xvmod_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmod_wu(_1, _2); }
// CHECK-LABEL: @xvmod_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmod_du(_1, _2); }
// CHECK-LABEL: @xvrepl128vei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrepl128vei_b(v32i8 _1) { return __builtin_lasx_xvrepl128vei_b(_1, 1); }
// CHECK-LABEL: @xvrepl128vei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrepl128vei_h(v16i16 _1) { return __builtin_lasx_xvrepl128vei_h(_1, 1); }
// CHECK-LABEL: @xvrepl128vei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrepl128vei_w(v8i32 _1) { return __builtin_lasx_xvrepl128vei_w(_1, 1); }
// CHECK-LABEL: @xvrepl128vei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrepl128vei_d(v4i64 _1) { return __builtin_lasx_xvrepl128vei_d(_1, 1); }
// CHECK-LABEL: @xvpickev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickev_b(_1, _2); }
// CHECK-LABEL: @xvpickev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickev_h(_1, _2); }
// CHECK-LABEL: @xvpickev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickev_w(_1, _2); }
// CHECK-LABEL: @xvpickev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickev_d(_1, _2); }
// CHECK-LABEL: @xvpickod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpickod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickod_b(_1, _2); }
// CHECK-LABEL: @xvpickod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickod_h(_1, _2); }
// CHECK-LABEL: @xvpickod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickod_w(_1, _2); }
// CHECK-LABEL: @xvpickod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickod_d(_1, _2); }
// CHECK-LABEL: @xvilvh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvh_b(_1, _2); }
// CHECK-LABEL: @xvilvh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvh_h(_1, _2); }
// CHECK-LABEL: @xvilvh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvh_w(_1, _2); }
// CHECK-LABEL: @xvilvh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvh_d(_1, _2); }
// CHECK-LABEL: @xvilvl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvl_b(_1, _2); }
// CHECK-LABEL: @xvilvl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvl_h(_1, _2); }
// CHECK-LABEL: @xvilvl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvl_w(_1, _2); }
// CHECK-LABEL: @xvilvl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvl_d(_1, _2); }
// CHECK-LABEL: @xvpackev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackev_b(_1, _2); }
// CHECK-LABEL: @xvpackev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackev_h(_1, _2); }
// CHECK-LABEL: @xvpackev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpackev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackev_w(_1, _2); }
// CHECK-LABEL: @xvpackev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackev_d(_1, _2); }
// CHECK-LABEL: @xvpackod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackod_b(_1, _2); }
// CHECK-LABEL: @xvpackod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackod_h(_1, _2); }
// CHECK-LABEL: @xvpackod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackod_w(_1, _2); }
// CHECK-LABEL: @xvpackod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpackod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackod_d(_1, _2); }
// CHECK-LABEL: @xvshuf_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvshuf_b(_1, _2, _3); }
// CHECK-LABEL: @xvshuf_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvshuf_h(_1, _2, _3); }
// CHECK-LABEL: @xvshuf_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvshuf_w(_1, _2, _3); }
// CHECK-LABEL: @xvshuf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvshuf_d(_1, _2, _3); }
// CHECK-LABEL: @xvand_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvand_v(_1, _2); }
// CHECK-LABEL: @xvandi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvandi_b(v32u8 _1) { return __builtin_lasx_xvandi_b(_1, 1); }
// CHECK-LABEL: @xvor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvor_v(_1, _2); }
// CHECK-LABEL: @xvori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvori_b(v32u8 _1) { return __builtin_lasx_xvori_b(_1, 1); }
// CHECK-LABEL: @xvnor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvnor_v(_1, _2); }
// CHECK-LABEL: @xvnori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvnori_b(v32u8 _1) { return __builtin_lasx_xvnori_b(_1, 1); }
// CHECK-LABEL: @xvxor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvxor_v(_1, _2); }
// CHECK-LABEL: @xvxori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvxori_b(v32u8 _1) { return __builtin_lasx_xvxori_b(_1, 1); }
// CHECK-LABEL: @xvbitsel_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvbitsel_v(_1, _2, _3); }
// CHECK-LABEL: @xvbitseli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitseli_b(_1, _2, 1); }
// CHECK-LABEL: @xvshuf4i_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvshuf4i_b(v32i8 _1) { return __builtin_lasx_xvshuf4i_b(_1, 1); }
// CHECK-LABEL: @xvshuf4i_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvshuf4i_h(v16i16 _1) { return __builtin_lasx_xvshuf4i_h(_1, 1); }
// CHECK-LABEL: @xvshuf4i_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvshuf4i_w(v8i32 _1) { return __builtin_lasx_xvshuf4i_w(_1, 1); }
// CHECK-LABEL: @xvreplgr2vr_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplgr2vr_b(int _1) { return __builtin_lasx_xvreplgr2vr_b(_1); }
// CHECK-LABEL: @xvreplgr2vr_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvreplgr2vr_h(int _1) { return __builtin_lasx_xvreplgr2vr_h(_1); }
// CHECK-LABEL: @xvreplgr2vr_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvreplgr2vr_w(int _1) { return __builtin_lasx_xvreplgr2vr_w(_1); }
// CHECK-LABEL: @xvreplgr2vr_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1:%.*]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 [[CONV]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvreplgr2vr_d(int _1) { return __builtin_lasx_xvreplgr2vr_d(_1); }
// CHECK-LABEL: @xvpcnt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpcnt_b(v32i8 _1) { return __builtin_lasx_xvpcnt_b(_1); }
// CHECK-LABEL: @xvpcnt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvpcnt_h(v16i16 _1) { return __builtin_lasx_xvpcnt_h(_1); }
// CHECK-LABEL: @xvpcnt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpcnt_w(v8i32 _1) { return __builtin_lasx_xvpcnt_w(_1); }
// CHECK-LABEL: @xvpcnt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpcnt_d(v4i64 _1) { return __builtin_lasx_xvpcnt_d(_1); }
// CHECK-LABEL: @xvclo_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvclo_b(v32i8 _1) { return __builtin_lasx_xvclo_b(_1); }
// CHECK-LABEL: @xvclo_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvclo_h(v16i16 _1) { return __builtin_lasx_xvclo_h(_1); }
// CHECK-LABEL: @xvclo_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvclo_w(v8i32 _1) { return __builtin_lasx_xvclo_w(_1); }
// CHECK-LABEL: @xvclo_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvclo_d(v4i64 _1) { return __builtin_lasx_xvclo_d(_1); }
// CHECK-LABEL: @xvclz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvclz_b(v32i8 _1) { return __builtin_lasx_xvclz_b(_1); }
// CHECK-LABEL: @xvclz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvclz_h(v16i16 _1) { return __builtin_lasx_xvclz_h(_1); }
// CHECK-LABEL: @xvclz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvclz_w(v8i32 _1) { return __builtin_lasx_xvclz_w(_1); }
// CHECK-LABEL: @xvclz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvclz_d(v4i64 _1) { return __builtin_lasx_xvclz_d(_1); }
// CHECK-LABEL: @xvfadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfadd_s(_1, _2); }
// CHECK-LABEL: @xvfadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfadd_d(_1, _2); }
// CHECK-LABEL: @xvfsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfsub_s(_1, _2); }
// CHECK-LABEL: @xvfsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfsub_d(_1, _2); }
// CHECK-LABEL: @xvfmul_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmul_s(_1, _2); }
// CHECK-LABEL: @xvfmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmul_d(_1, _2); }
// CHECK-LABEL: @xvfdiv_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfdiv_s(_1, _2); }
// CHECK-LABEL: @xvfdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfdiv_d(_1, _2); }
// CHECK-LABEL: @xvfcvt_h_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcvt_h_s(_1, _2); }
// CHECK-LABEL: @xvfcvt_s_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcvt_s_d(_1, _2); }
// CHECK-LABEL: @xvfmin_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmin_s(_1, _2); }
// CHECK-LABEL: @xvfmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmin_d(_1, _2); }
// CHECK-LABEL: @xvfmina_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmina_s(_1, _2); }
// CHECK-LABEL: @xvfmina_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmina_d(_1, _2); }
// CHECK-LABEL: @xvfmax_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmax_s(_1, _2); }
// CHECK-LABEL: @xvfmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmax_d(_1, _2); }
// CHECK-LABEL: @xvfmaxa_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmaxa_s(_1, _2); }
// CHECK-LABEL: @xvfmaxa_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmaxa_d(_1, _2); }
// CHECK-LABEL: @xvfclass_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfclass_s(v8f32 _1) { return __builtin_lasx_xvfclass_s(_1); }
// CHECK-LABEL: @xvfclass_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfclass_d(v4f64 _1) { return __builtin_lasx_xvfclass_d(_1); }
// CHECK-LABEL: @xvfsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfsqrt_s(v8f32 _1) { return __builtin_lasx_xvfsqrt_s(_1); }
// CHECK-LABEL: @xvfsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfsqrt_d(v4f64 _1) { return __builtin_lasx_xvfsqrt_d(_1); }
// CHECK-LABEL: @xvfrecip_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfrecip_s(v8f32 _1) { return __builtin_lasx_xvfrecip_s(_1); }
// CHECK-LABEL: @xvfrecip_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfrecip_d(v4f64 _1) { return __builtin_lasx_xvfrecip_d(_1); }
// CHECK-LABEL: @xvfrint_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfrint_s(v8f32 _1) { return __builtin_lasx_xvfrint_s(_1); }
// CHECK-LABEL: @xvfrint_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfrint_d(v4f64 _1) { return __builtin_lasx_xvfrint_d(_1); }
// CHECK-LABEL: @xvfrsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfrsqrt_s(v8f32 _1) { return __builtin_lasx_xvfrsqrt_s(_1); }
// CHECK-LABEL: @xvfrsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfrsqrt_d(v4f64 _1) { return __builtin_lasx_xvfrsqrt_d(_1); }
// CHECK-LABEL: @xvflogb_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvflogb_s(v8f32 _1) { return __builtin_lasx_xvflogb_s(_1); }
// CHECK-LABEL: @xvflogb_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvflogb_d(v4f64 _1) { return __builtin_lasx_xvflogb_d(_1); }
// CHECK-LABEL: @xvfcvth_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfcvth_s_h(v16i16 _1) { return __builtin_lasx_xvfcvth_s_h(_1); }
// CHECK-LABEL: @xvfcvth_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfcvth_d_s(v8f32 _1) { return __builtin_lasx_xvfcvth_d_s(_1); }
// CHECK-LABEL: @xvfcvtl_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfcvtl_s_h(v16i16 _1) { return __builtin_lasx_xvfcvtl_s_h(_1); }
// CHECK-LABEL: @xvfcvtl_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfcvtl_d_s(v8f32 _1) { return __builtin_lasx_xvfcvtl_d_s(_1); }
// CHECK-LABEL: @xvftint_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftint_w_s(v8f32 _1) { return __builtin_lasx_xvftint_w_s(_1); }
// CHECK-LABEL: @xvftint_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftint_l_d(v4f64 _1) { return __builtin_lasx_xvftint_l_d(_1); }
// CHECK-LABEL: @xvftint_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvftint_wu_s(v8f32 _1) { return __builtin_lasx_xvftint_wu_s(_1); }
// CHECK-LABEL: @xvftint_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvftint_lu_d(v4f64 _1) { return __builtin_lasx_xvftint_lu_d(_1); }
// CHECK-LABEL: @xvftintrz_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrz_w_s(v8f32 _1) { return __builtin_lasx_xvftintrz_w_s(_1); }
// CHECK-LABEL: @xvftintrz_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrz_l_d(v4f64 _1) { return __builtin_lasx_xvftintrz_l_d(_1); }
// CHECK-LABEL: @xvftintrz_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvftintrz_wu_s(v8f32 _1) { return __builtin_lasx_xvftintrz_wu_s(_1); }
// CHECK-LABEL: @xvftintrz_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvftintrz_lu_d(v4f64 _1) { return __builtin_lasx_xvftintrz_lu_d(_1); }
// CHECK-LABEL: @xvffint_s_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvffint_s_w(v8i32 _1) { return __builtin_lasx_xvffint_s_w(_1); }
// CHECK-LABEL: @xvffint_d_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffint_d_l(v4i64 _1) { return __builtin_lasx_xvffint_d_l(_1); }
// CHECK-LABEL: @xvffint_s_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvffint_s_wu(v8u32 _1) { return __builtin_lasx_xvffint_s_wu(_1); }
// CHECK-LABEL: @xvffint_d_lu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffint_d_lu(v4u64 _1) { return __builtin_lasx_xvffint_d_lu(_1); }
// CHECK-LABEL: @xvreplve_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplve_b(v32i8 _1, int _2) { return __builtin_lasx_xvreplve_b(_1, _2); }
// CHECK-LABEL: @xvreplve_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvreplve_h(v16i16 _1, int _2) { return __builtin_lasx_xvreplve_h(_1, _2); }
// CHECK-LABEL: @xvreplve_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvreplve_w(v8i32 _1, int _2) { return __builtin_lasx_xvreplve_w(_1, _2); }
// CHECK-LABEL: @xvreplve_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2:%.*]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvreplve_d(v4i64 _1, int _2) { return __builtin_lasx_xvreplve_d(_1, _2); }
// CHECK-LABEL: @xvpermi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpermi_w(_1, _2, 1); }
// CHECK-LABEL: @xvandn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvandn_v(_1, _2); }
// CHECK-LABEL: @xvneg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvneg_b(v32i8 _1) { return __builtin_lasx_xvneg_b(_1); }
// CHECK-LABEL: @xvneg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvneg_h(v16i16 _1) { return __builtin_lasx_xvneg_h(_1); }
// CHECK-LABEL: @xvneg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvneg_w(v8i32 _1) { return __builtin_lasx_xvneg_w(_1); }
// CHECK-LABEL: @xvneg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvneg_d(v4i64 _1) { return __builtin_lasx_xvneg_d(_1); }
// CHECK-LABEL: @xvmuh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmuh_b(_1, _2); }
// CHECK-LABEL: @xvmuh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmuh_h(_1, _2); }
// CHECK-LABEL: @xvmuh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmuh_w(_1, _2); }
// CHECK-LABEL: @xvmuh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmuh_d(_1, _2); }
// CHECK-LABEL: @xvmuh_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmuh_bu(_1, _2); }
// CHECK-LABEL: @xvmuh_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmuh_hu(_1, _2); }
// CHECK-LABEL: @xvmuh_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmuh_wu(_1, _2); }
// CHECK-LABEL: @xvmuh_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmuh_du(_1, _2); }
// CHECK-LABEL: @xvsllwil_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsllwil_h_b(v32i8 _1) { return __builtin_lasx_xvsllwil_h_b(_1, 1); }
// CHECK-LABEL: @xvsllwil_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsllwil_w_h(v16i16 _1) { return __builtin_lasx_xvsllwil_w_h(_1, 1); }
// CHECK-LABEL: @xvsllwil_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsllwil_d_w(v8i32 _1) { return __builtin_lasx_xvsllwil_d_w(_1, 1); }
// CHECK-LABEL: @xvsllwil_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvsllwil_hu_bu(v32u8 _1) { return __builtin_lasx_xvsllwil_hu_bu(_1, 1); }
// CHECK-LABEL: @xvsllwil_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvsllwil_wu_hu(v16u16 _1) { return __builtin_lasx_xvsllwil_wu_hu(_1, 1); }
// CHECK-LABEL: @xvsllwil_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvsllwil_du_wu(v8u32 _1) { return __builtin_lasx_xvsllwil_du_wu(_1, 1); }
// CHECK-LABEL: @xvsran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsran_b_h(_1, _2); }
// CHECK-LABEL: @xvsran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsran_h_w(_1, _2); }
// CHECK-LABEL: @xvsran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsran_w_d(_1, _2); }
// CHECK-LABEL: @xvssran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssran_b_h(_1, _2); }
// CHECK-LABEL: @xvssran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssran_h_w(_1, _2); }
// CHECK-LABEL: @xvssran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssran_w_d(_1, _2); }
// CHECK-LABEL: @xvssran_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssran_bu_h(_1, _2); }
// CHECK-LABEL: @xvssran_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssran_hu_w(_1, _2); }
// CHECK-LABEL: @xvssran_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssran_wu_d(_1, _2); }
// CHECK-LABEL: @xvsrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarn_b_h(_1, _2); }
// CHECK-LABEL: @xvsrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarn_h_w(_1, _2); }
// CHECK-LABEL: @xvsrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarn_b_h(_1, _2); }
// CHECK-LABEL: @xvssrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarn_h_w(_1, _2); }
// CHECK-LABEL: @xvssrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrarn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrarn_bu_h(_1, _2); }
// CHECK-LABEL: @xvssrarn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrarn_hu_w(_1, _2); }
// CHECK-LABEL: @xvssrarn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrarn_wu_d(_1, _2); }
// CHECK-LABEL: @xvsrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrln_b_h(_1, _2); }
// CHECK-LABEL: @xvsrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrln_h_w(_1, _2); }
// CHECK-LABEL: @xvsrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrln_w_d(_1, _2); }
// CHECK-LABEL: @xvssrln_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrln_bu_h(_1, _2); }
// CHECK-LABEL: @xvssrln_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrln_hu_w(_1, _2); }
// CHECK-LABEL: @xvssrln_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrln_wu_d(_1, _2); }
// CHECK-LABEL: @xvsrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrn_b_h(_1, _2); }
// CHECK-LABEL: @xvsrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrn_h_w(_1, _2); }
// CHECK-LABEL: @xvsrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrlrn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrlrn_bu_h(_1, _2); }
// CHECK-LABEL: @xvssrlrn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrlrn_hu_w(_1, _2); }
// CHECK-LABEL: @xvssrlrn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrlrn_wu_d(_1, _2); }
// CHECK-LABEL: @xvfrstpi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvfrstpi_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvfrstpi_b(_1, _2, 1); }
// CHECK-LABEL: @xvfrstpi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvfrstpi_h(_1, _2, 1); }
// CHECK-LABEL: @xvfrstp_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvfrstp_b(_1, _2, _3); }
// CHECK-LABEL: @xvfrstp_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvfrstp_h(_1, _2, _3); }
// CHECK-LABEL: @xvshuf4i_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvshuf4i_d(_1, _2, 1); }
// CHECK-LABEL: @xvbsrl_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvbsrl_v(v32i8 _1) { return __builtin_lasx_xvbsrl_v(_1, 1); }
// CHECK-LABEL: @xvbsll_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvbsll_v(v32i8 _1) { return __builtin_lasx_xvbsll_v(_1, 1); }
// CHECK-LABEL: @xvextrins_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvextrins_b(_1, _2, 1); }
// CHECK-LABEL: @xvextrins_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvextrins_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvextrins_h(_1, _2, 1); }
// CHECK-LABEL: @xvextrins_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvextrins_w(_1, _2, 1); }
// CHECK-LABEL: @xvextrins_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvextrins_d(_1, _2, 1); }
// CHECK-LABEL: @xvmskltz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmskltz_b(v32i8 _1) { return __builtin_lasx_xvmskltz_b(_1); }
// CHECK-LABEL: @xvmskltz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmskltz_h(v16i16 _1) { return __builtin_lasx_xvmskltz_h(_1); }
// CHECK-LABEL: @xvmskltz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmskltz_w(v8i32 _1) { return __builtin_lasx_xvmskltz_w(_1); }
// CHECK-LABEL: @xvmskltz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmskltz_d(v4i64 _1) { return __builtin_lasx_xvmskltz_d(_1); }
// CHECK-LABEL: @xvsigncov_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsigncov_b(_1, _2); }
// CHECK-LABEL: @xvsigncov_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsigncov_h(_1, _2); }
// CHECK-LABEL: @xvsigncov_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsigncov_w(_1, _2); }
// CHECK-LABEL: @xvsigncov_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsigncov_d(_1, _2); }
// CHECK-LABEL: @xvfmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmadd_s(_1, _2, _3); }
// CHECK-LABEL: @xvfmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmadd_d(_1, _2, _3); }
// CHECK-LABEL: @xvfmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmsub_s(_1, _2, _3); }
// CHECK-LABEL: @xvfmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmsub_d(_1, _2, _3); }
// CHECK-LABEL: @xvfnmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmadd_s(_1, _2, _3); }
// CHECK-LABEL: @xvfnmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmadd_d(_1, _2, _3); }
// CHECK-LABEL: @xvfnmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]], <8 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmsub_s(_1, _2, _3); }
// CHECK-LABEL: @xvfnmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]], <4 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmsub_d(_1, _2, _3); }
// CHECK-LABEL: @xvftintrne_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrne_w_s(v8f32 _1) { return __builtin_lasx_xvftintrne_w_s(_1); }
// CHECK-LABEL: @xvftintrne_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrne_l_d(v4f64 _1) { return __builtin_lasx_xvftintrne_l_d(_1); }
// CHECK-LABEL: @xvftintrp_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrp_w_s(v8f32 _1) { return __builtin_lasx_xvftintrp_w_s(_1); }
// CHECK-LABEL: @xvftintrp_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrp_l_d(v4f64 _1) { return __builtin_lasx_xvftintrp_l_d(_1); }
// CHECK-LABEL: @xvftintrm_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrm_w_s(v8f32 _1) { return __builtin_lasx_xvftintrm_w_s(_1); }
// CHECK-LABEL: @xvftintrm_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrm_l_d(v4f64 _1) { return __builtin_lasx_xvftintrm_l_d(_1); }
// CHECK-LABEL: @xvftint_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftint_w_d(_1, _2); }
// CHECK-LABEL: @xvffint_s_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvffint_s_l(_1, _2); }
// CHECK-LABEL: @xvftintrz_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrz_w_d(_1, _2); }
// CHECK-LABEL: @xvftintrp_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrp_w_d(_1, _2); }
// CHECK-LABEL: @xvftintrm_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrm_w_d(_1, _2); }
// CHECK-LABEL: @xvftintrne_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrne_w_d(_1, _2); }
// CHECK-LABEL: @xvftinth_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftinth_l_s(v8f32 _1) { return __builtin_lasx_xvftinth_l_s(_1); }
// CHECK-LABEL: @xvftintl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintl_l_s(v8f32 _1) { return __builtin_lasx_xvftintl_l_s(_1); }
// CHECK-LABEL: @xvffinth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffinth_d_w(v8i32 _1) { return __builtin_lasx_xvffinth_d_w(_1); }
// CHECK-LABEL: @xvffintl_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvffintl_d_w(v8i32 _1) { return __builtin_lasx_xvffintl_d_w(_1); }
// CHECK-LABEL: @xvftintrzh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrzh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzh_l_s(_1); }
// CHECK-LABEL: @xvftintrzl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrzl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzl_l_s(_1); }
// CHECK-LABEL: @xvftintrph_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrph_l_s(v8f32 _1) { return __builtin_lasx_xvftintrph_l_s(_1); }
// CHECK-LABEL: @xvftintrpl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrpl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrpl_l_s(_1); }
// CHECK-LABEL: @xvftintrmh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrmh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrmh_l_s(_1); }
// CHECK-LABEL: @xvftintrml_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrml_l_s(v8f32 _1) { return __builtin_lasx_xvftintrml_l_s(_1); }
// CHECK-LABEL: @xvftintrneh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrneh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrneh_l_s(_1); }
// CHECK-LABEL: @xvftintrnel_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvftintrnel_l_s(v8f32 _1) { return __builtin_lasx_xvftintrnel_l_s(_1); }
// CHECK-LABEL: @xvfrintrne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrne_s(v8f32 _1) { return __builtin_lasx_xvfrintrne_s(_1); }
// CHECK-LABEL: @xvfrintrne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrne_d(v4f64 _1) { return __builtin_lasx_xvfrintrne_d(_1); }
// CHECK-LABEL: @xvfrintrz_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrz_s(v8f32 _1) { return __builtin_lasx_xvfrintrz_s(_1); }
// CHECK-LABEL: @xvfrintrz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrz_d(v4f64 _1) { return __builtin_lasx_xvfrintrz_d(_1); }
// CHECK-LABEL: @xvfrintrp_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrp_s(v8f32 _1) { return __builtin_lasx_xvfrintrp_s(_1); }
// CHECK-LABEL: @xvfrintrp_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrp_d(v4f64 _1) { return __builtin_lasx_xvfrintrp_d(_1); }
// CHECK-LABEL: @xvfrintrm_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[TMP0]] to <8 x i32>
-// CHECK-NEXT: ret <8 x i32> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfrintrm_s(v8f32 _1) { return __builtin_lasx_xvfrintrm_s(_1); }
// CHECK-LABEL: @xvfrintrm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
-// CHECK-NEXT: ret <4 x i64> [[TMP1]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfrintrm_d(v4f64 _1) { return __builtin_lasx_xvfrintrm_d(_1); }
// CHECK-LABEL: @xvld(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvld(void *_1) { return __builtin_lasx_xvld(_1, 1); }
// CHECK-LABEL: @xvst(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1)
// CHECK-NEXT: ret void
//
void xvst(v32i8 _1, void *_2) { return __builtin_lasx_xvst(_1, _2, 1); }
// CHECK-LABEL: @xvstelm_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2:%.*]], i32 1, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_b(v32i8 _1, void * _2) { return __builtin_lasx_xvstelm_b(_1, _2, 1, 1); }
// CHECK-LABEL: @xvstelm_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2:%.*]], i32 2, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_h(v16i16 _1, void * _2) { return __builtin_lasx_xvstelm_h(_1, _2, 2, 1); }
// CHECK-LABEL: @xvstelm_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2:%.*]], i32 4, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_w(v8i32 _1, void * _2) { return __builtin_lasx_xvstelm_w(_1, _2, 4, 1); }
// CHECK-LABEL: @xvstelm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1)
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2:%.*]], i32 8, i32 1)
// CHECK-NEXT: ret void
//
void xvstelm_d(v4i64 _1, void * _2) { return __builtin_lasx_xvstelm_d(_1, _2, 8, 1); }
// CHECK-LABEL: @xvinsve0_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvinsve0_w(_1, _2, 1); }
// CHECK-LABEL: @xvinsve0_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvinsve0_d(_1, _2, 1); }
// CHECK-LABEL: @xvpickve_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvpickve_w(v8i32 _1) { return __builtin_lasx_xvpickve_w(_1, 1); }
// CHECK-LABEL: @xvpickve_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpickve_d(v4i64 _1) { return __builtin_lasx_xvpickve_d(_1, 1); }
// CHECK-LABEL: @xvssrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrn_b_h(_1, _2); }
// CHECK-LABEL: @xvssrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrn_h_w(_1, _2); }
// CHECK-LABEL: @xvssrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrn_w_d(_1, _2); }
// CHECK-LABEL: @xvssrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrln_b_h(_1, _2); }
// CHECK-LABEL: @xvssrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrln_h_w(_1, _2); }
// CHECK-LABEL: @xvssrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrln_w_d(_1, _2); }
// CHECK-LABEL: @xvorn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvorn_v(_1, _2); }
// CHECK-LABEL: @xvldi(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvldi() { return __builtin_lasx_xvldi(1); }
// CHECK-LABEL: @xvldx(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1:%.*]], i64 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvldx(void *_1) { return __builtin_lasx_xvldx(_1, 1); }
// CHECK-LABEL: @xvstx(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1)
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1]], ptr [[_2:%.*]], i64 1)
// CHECK-NEXT: ret void
//
void xvstx(v32i8 _1, void *_2) { return __builtin_lasx_xvstx(_1, _2, 1); }
// CHECK-LABEL: @xvextl_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvextl_qu_du(v4u64 _1) { return __builtin_lasx_xvextl_qu_du(_1); }
// CHECK-LABEL: @xvinsgr2vr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvinsgr2vr_w(v8i32 _1) { return __builtin_lasx_xvinsgr2vr_w(_1, 1, 1); }
// CHECK-LABEL: @xvinsgr2vr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1:%.*]], i64 1, i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvinsgr2vr_d(v4i64 _1) { return __builtin_lasx_xvinsgr2vr_d(_1, 1, 1); }
// CHECK-LABEL: @xvreplve0_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplve0_b(v32i8 _1) { return __builtin_lasx_xvreplve0_b(_1); }
// CHECK-LABEL: @xvreplve0_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvreplve0_h(v16i16 _1) { return __builtin_lasx_xvreplve0_h(_1); }
// CHECK-LABEL: @xvreplve0_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvreplve0_w(v8i32 _1) { return __builtin_lasx_xvreplve0_w(_1); }
// CHECK-LABEL: @xvreplve0_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvreplve0_d(v4i64 _1) { return __builtin_lasx_xvreplve0_d(_1); }
// CHECK-LABEL: @xvreplve0_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvreplve0_q(v32i8 _1) { return __builtin_lasx_xvreplve0_q(_1); }
// CHECK-LABEL: @vext2xv_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 vext2xv_h_b(v32i8 _1) { return __builtin_lasx_vext2xv_h_b(_1); }
// CHECK-LABEL: @vext2xv_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_w_h(v16i16 _1) { return __builtin_lasx_vext2xv_w_h(_1); }
// CHECK-LABEL: @vext2xv_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_d_w(v8i32 _1) { return __builtin_lasx_vext2xv_d_w(_1); }
// CHECK-LABEL: @vext2xv_w_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_w_b(v32i8 _1) { return __builtin_lasx_vext2xv_w_b(_1); }
// CHECK-LABEL: @vext2xv_d_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_d_h(v16i16 _1) { return __builtin_lasx_vext2xv_d_h(_1); }
// CHECK-LABEL: @vext2xv_d_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_d_b(v32i8 _1) { return __builtin_lasx_vext2xv_d_b(_1); }
// CHECK-LABEL: @vext2xv_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 vext2xv_hu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_hu_bu(_1); }
// CHECK-LABEL: @vext2xv_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_wu_hu(v16i16 _1) { return __builtin_lasx_vext2xv_wu_hu(_1); }
// CHECK-LABEL: @vext2xv_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_du_wu(v8i32 _1) { return __builtin_lasx_vext2xv_du_wu(_1); }
// CHECK-LABEL: @vext2xv_wu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 vext2xv_wu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_wu_bu(_1); }
// CHECK-LABEL: @vext2xv_du_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_du_hu(v16i16 _1) { return __builtin_lasx_vext2xv_du_hu(_1); }
// CHECK-LABEL: @vext2xv_du_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 vext2xv_du_bu(v32i8 _1) { return __builtin_lasx_vext2xv_du_bu(_1); }
// CHECK-LABEL: @xvpermi_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpermi_q(_1, _2, 1); }
// CHECK-LABEL: @xvpermi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvpermi_d(v4i64 _1) { return __builtin_lasx_xvpermi_d(_1, 1); }
// CHECK-LABEL: @xvperm_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvperm_w(_1, _2); }
// CHECK-LABEL: @xvldrepl_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvldrepl_b(void *_1) { return __builtin_lasx_xvldrepl_b(_1, 1); }
// CHECK-LABEL: @xvldrepl_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1:%.*]], i32 2)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvldrepl_h(void *_1) { return __builtin_lasx_xvldrepl_h(_1, 2); }
// CHECK-LABEL: @xvldrepl_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1:%.*]], i32 4)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvldrepl_w(void *_1) { return __builtin_lasx_xvldrepl_w(_1, 4); }
// CHECK-LABEL: @xvldrepl_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1:%.*]], i32 8)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvldrepl_d(void *_1) { return __builtin_lasx_xvldrepl_d(_1, 8); }
// CHECK-LABEL: @xvpickve2gr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xvpickve2gr_w(v8i32 _1) { return __builtin_lasx_xvpickve2gr_w(_1, 1); }
// CHECK-LABEL: @xvpickve2gr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int xvpickve2gr_wu(v8i32 _1) { return __builtin_lasx_xvpickve2gr_wu(_1, 1); }
// CHECK-LABEL: @xvpickve2gr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
long xvpickve2gr_d(v4i64 _1) { return __builtin_lasx_xvpickve2gr_d(_1, 1); }
// CHECK-LABEL: @xvpickve2gr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
unsigned long int xvpickve2gr_du(v4i64 _1) { return __builtin_lasx_xvpickve2gr_du(_1, 1); }
// CHECK-LABEL: @xvaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_d(_1, _2); }
// CHECK-LABEL: @xvaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_w(_1, _2); }
// CHECK-LABEL: @xvaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_h(_1, _2); }
// CHECK-LABEL: @xvaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_b(_1, _2); }
// CHECK-LABEL: @xvaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvaddwev_q_du(_1, _2); }
// CHECK-LABEL: @xvaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvaddwev_d_wu(_1, _2); }
// CHECK-LABEL: @xvaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwev_w_hu(_1, _2); }
// CHECK-LABEL: @xvaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwev_h_bu(_1, _2); }
// CHECK-LABEL: @xvsubwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsubwev_q_d(_1, _2); }
// CHECK-LABEL: @xvsubwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwev_d_w(_1, _2); }
// CHECK-LABEL: @xvsubwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwev_w_h(_1, _2); }
// CHECK-LABEL: @xvsubwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwev_h_b(_1, _2); }
// CHECK-LABEL: @xvsubwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwev_q_du(_1, _2); }
// CHECK-LABEL: @xvsubwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwev_d_wu(_1, _2); }
// CHECK-LABEL: @xvsubwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwev_w_hu(_1, _2); }
// CHECK-LABEL: @xvsubwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwev_h_bu(_1, _2); }
// CHECK-LABEL: @xvmulwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_d(_1, _2); }
// CHECK-LABEL: @xvmulwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmulwev_d_w(_1, _2); }
// CHECK-LABEL: @xvmulwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_h(_1, _2); }
// CHECK-LABEL: @xvmulwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_b(_1, _2); }
// CHECK-LABEL: @xvmulwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwev_q_du(_1, _2); }
// CHECK-LABEL: @xvmulwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwev_d_wu(_1, _2); }
// CHECK-LABEL: @xvmulwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwev_w_hu(_1, _2); }
// CHECK-LABEL: @xvmulwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwev_h_bu(_1, _2); }
// CHECK-LABEL: @xvaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_d(_1, _2); }
// CHECK-LABEL: @xvaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_w(_1, _2); }
// CHECK-LABEL: @xvaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_h(_1, _2); }
// CHECK-LABEL: @xvaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_b(_1, _2); }
// CHECK-LABEL: @xvaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvaddwod_q_du(_1, _2); }
// CHECK-LABEL: @xvaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvaddwod_d_wu(_1, _2); }
// CHECK-LABEL: @xvaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwod_w_hu(_1, _2); }
// CHECK-LABEL: @xvaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwod_h_bu(_1, _2); }
// CHECK-LABEL: @xvsubwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsubwod_q_d(_1, _2); }
// CHECK-LABEL: @xvsubwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwod_d_w(_1, _2); }
// CHECK-LABEL: @xvsubwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwod_w_h(_1, _2); }
// CHECK-LABEL: @xvsubwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwod_h_b(_1, _2); }
// CHECK-LABEL: @xvsubwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwod_q_du(_1, _2); }
// CHECK-LABEL: @xvsubwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwod_d_wu(_1, _2); }
// CHECK-LABEL: @xvsubwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwod_w_hu(_1, _2); }
// CHECK-LABEL: @xvsubwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwod_h_bu(_1, _2); }
// CHECK-LABEL: @xvmulwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_d(_1, _2); }
// CHECK-LABEL: @xvmulwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_w(_1, _2); }
// CHECK-LABEL: @xvmulwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_h(_1, _2); }
// CHECK-LABEL: @xvmulwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmulwod_h_b(_1, _2); }
// CHECK-LABEL: @xvmulwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwod_q_du(_1, _2); }
// CHECK-LABEL: @xvmulwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwod_d_wu(_1, _2); }
// CHECK-LABEL: @xvmulwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwod_w_hu(_1, _2); }
// CHECK-LABEL: @xvmulwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwod_h_bu(_1, _2); }
// CHECK-LABEL: @xvaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvmulwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwev_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvmulwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvmulwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvmulwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_wu_w(_1, _2); }
// CHECK-LABEL: @xvmulwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_hu_h(_1, _2); }
// CHECK-LABEL: @xvmulwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwod_h_bu_b(_1, _2); }
// CHECK-LABEL: @xvhaddw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhaddw_q_d(_1, _2); }
// CHECK-LABEL: @xvhaddw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhaddw_qu_du(_1, _2); }
// CHECK-LABEL: @xvhsubw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhsubw_q_d(_1, _2); }
// CHECK-LABEL: @xvhsubw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhsubw_qu_du(_1, _2); }
// CHECK-LABEL: @xvmaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwev_d_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwev_w_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_b(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwev_q_du(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwev_d_wu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmaddwev_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __builtin_lasx_xvmaddwev_w_hu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwev_h_bu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwod_w_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_b(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwod_q_du(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwod_d_wu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __builtin_lasx_xvmaddwod_w_hu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwod_h_bu(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_du_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwev_d_wu_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwev_w_hu_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_bu_b(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], <4 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_du_d(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1:%.*]], <8 x i32> [[_2:%.*]], <8 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_wu_w(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1:%.*]], <16 x i16> [[_2:%.*]], <16 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvmaddwod_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwod_w_hu_h(_1, _2, _3); }
// CHECK-LABEL: @xvmaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1:%.*]], <32 x i8> [[_2:%.*]], <32 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_bu_b(_1, _2, _3); }
// CHECK-LABEL: @xvrotr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvrotr_b(_1, _2); }
// CHECK-LABEL: @xvrotr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvrotr_h(_1, _2); }
// CHECK-LABEL: @xvrotr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvrotr_w(_1, _2); }
// CHECK-LABEL: @xvrotr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvrotr_d(_1, _2); }
// CHECK-LABEL: @xvadd_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_q(_1, _2); }
// CHECK-LABEL: @xvsub_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_q(_1, _2); }
// CHECK-LABEL: @xvaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_du_d(_1, _2); }
// CHECK-LABEL: @xvaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_du_d(_1, _2); }
// CHECK-LABEL: @xvmulwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_du_d(_1, _2); }
// CHECK-LABEL: @xvmulwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_du_d(_1, _2); }
// CHECK-LABEL: @xvmskgez_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmskgez_b(v32i8 _1) { return __builtin_lasx_xvmskgez_b(_1); }
// CHECK-LABEL: @xvmsknz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvmsknz_b(v32i8 _1) { return __builtin_lasx_xvmsknz_b(_1); }
// CHECK-LABEL: @xvexth_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvexth_h_b(v32i8 _1) { return __builtin_lasx_xvexth_h_b(_1); }
// CHECK-LABEL: @xvexth_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvexth_w_h(v16i16 _1) { return __builtin_lasx_xvexth_w_h(_1); }
// CHECK-LABEL: @xvexth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvexth_d_w(v8i32 _1) { return __builtin_lasx_xvexth_d_w(_1); }
// CHECK-LABEL: @xvexth_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvexth_q_d(v4i64 _1) { return __builtin_lasx_xvexth_q_d(_1); }
// CHECK-LABEL: @xvexth_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1]])
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvexth_hu_bu(v32u8 _1) { return __builtin_lasx_xvexth_hu_bu(_1); }
// CHECK-LABEL: @xvexth_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1]])
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvexth_wu_hu(v16u16 _1) { return __builtin_lasx_xvexth_wu_hu(_1); }
// CHECK-LABEL: @xvexth_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvexth_du_wu(v8u32 _1) { return __builtin_lasx_xvexth_du_wu(_1); }
// CHECK-LABEL: @xvexth_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvexth_qu_du(v4u64 _1) { return __builtin_lasx_xvexth_qu_du(_1); }
// CHECK-LABEL: @xvrotri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrotri_b(v32i8 _1) { return __builtin_lasx_xvrotri_b(_1, 1); }
// CHECK-LABEL: @xvrotri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrotri_h(v16i16 _1) { return __builtin_lasx_xvrotri_h(_1, 1); }
// CHECK-LABEL: @xvrotri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrotri_w(v8i32 _1) { return __builtin_lasx_xvrotri_w(_1, 1); }
// CHECK-LABEL: @xvrotri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrotri_d(v4i64 _1) { return __builtin_lasx_xvrotri_d(_1, 1); }
// CHECK-LABEL: @xvextl_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]])
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvextl_q_d(v4i64 _1) { return __builtin_lasx_xvextl_q_d(_1); }
// CHECK-LABEL: @xvsrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlrni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_du_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrlrni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_du_q(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrani_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrani_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrani_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrani_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrarni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvsrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrani_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_du_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_b_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_h_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1:%.*]], <32 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_bu_h(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1:%.*]], <16 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_hu_w(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1:%.*]], <8 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_wu_d(_1, _2, 1); }
// CHECK-LABEL: @xvssrarni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1:%.*]], <4 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_du_q(_1, _2, 1); }
// CHECK-LABEL: @xbnz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_b(v32u8 _1) { return __builtin_lasx_xbnz_b(_1); }
// CHECK-LABEL: @xbnz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_d(v4u64 _1) { return __builtin_lasx_xbnz_d(_1); }
// CHECK-LABEL: @xbnz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_h(v16u16 _1) { return __builtin_lasx_xbnz_h(_1); }
// CHECK-LABEL: @xbnz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_v(v32u8 _1) { return __builtin_lasx_xbnz_v(_1); }
// CHECK-LABEL: @xbnz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbnz_w(v8u32 _1) { return __builtin_lasx_xbnz_w(_1); }
// CHECK-LABEL: @xbz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_b(v32u8 _1) { return __builtin_lasx_xbz_b(_1); }
// CHECK-LABEL: @xbz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_d(v4u64 _1) { return __builtin_lasx_xbz_d(_1); }
// CHECK-LABEL: @xbz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_h(v16u16 _1) { return __builtin_lasx_xbz_h(_1); }
// CHECK-LABEL: @xbz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_v(v32u8 _1) { return __builtin_lasx_xbz_v(_1); }
// CHECK-LABEL: @xbz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int xbz_w(v8u32 _1) { return __builtin_lasx_xbz_w(_1); }
// CHECK-LABEL: @xvfcmp_caf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_caf_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_caf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_caf_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_ceq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_ceq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_ceq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_ceq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cle_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cle_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_clt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_clt_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_clt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_clt_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cne_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cne_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cor_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cor_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cueq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cueq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cule_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cule_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cult_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cult_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cun_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cune_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_cune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cune_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_cun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cun_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_saf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_saf_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_saf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_saf_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_seq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_seq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_seq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_seq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sle_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sle_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_slt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_slt_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_slt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_slt_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sne_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sne_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sor_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sor_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sueq_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sueq_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sule_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sule_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sult_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sult_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sun_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1:%.*]], <4 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]])
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sune_d(_1, _2); }
// CHECK-LABEL: @xvfcmp_sune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sune_s(_1, _2); }
// CHECK-LABEL: @xvfcmp_sun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1:%.*]], <8 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]])
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sun_s(_1, _2); }
// CHECK-LABEL: @xvpickve_d_f(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x double> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1)
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4f64 xvpickve_d_f(v4f64 _1) { return __builtin_lasx_xvpickve_d_f(_1, 1); }
// CHECK-LABEL: @xvpickve_w_f(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x float> [[TMP0]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1)
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8f32 xvpickve_w_f(v8f32 _1) { return __builtin_lasx_xvpickve_w_f(_1, 1); }
// CHECK-LABEL: @xvrepli_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1)
-// CHECK-NEXT: ret <32 x i8> [[TMP0]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v32i8 xvrepli_b() { return __builtin_lasx_xvrepli_b(1); }
// CHECK-LABEL: @xvrepli_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1)
-// CHECK-NEXT: ret <4 x i64> [[TMP0]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v4i64 xvrepli_d() { return __builtin_lasx_xvrepli_d(1); }
// CHECK-LABEL: @xvrepli_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1)
-// CHECK-NEXT: ret <16 x i16> [[TMP0]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); }
// CHECK-LABEL: @xvrepli_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1)
-// CHECK-NEXT: ret <8 x i32> [[TMP0]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
//
v8i32 xvrepli_w() { return __builtin_lasx_xvrepli_w(1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c
index 331e29fb7d17..7a84e0ae24f9 100644
--- a/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-alias.c
@@ -5,4080 +5,5838 @@
// CHECK-LABEL: @vsll_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsll_b(v16i8 _1, v16i8 _2) { return __lsx_vsll_b(_1, _2); }
// CHECK-LABEL: @vsll_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsll_h(v8i16 _1, v8i16 _2) { return __lsx_vsll_h(_1, _2); }
// CHECK-LABEL: @vsll_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsll_w(v4i32 _1, v4i32 _2) { return __lsx_vsll_w(_1, _2); }
// CHECK-LABEL: @vsll_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsll_d(v2i64 _1, v2i64 _2) { return __lsx_vsll_d(_1, _2); }
// CHECK-LABEL: @vslli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslli_b(v16i8 _1) { return __lsx_vslli_b(_1, 1); }
// CHECK-LABEL: @vslli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslli_h(v8i16 _1) { return __lsx_vslli_h(_1, 1); }
// CHECK-LABEL: @vslli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslli_w(v4i32 _1) { return __lsx_vslli_w(_1, 1); }
// CHECK-LABEL: @vslli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslli_d(v2i64 _1) { return __lsx_vslli_d(_1, 1); }
// CHECK-LABEL: @vsra_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsra_b(v16i8 _1, v16i8 _2) { return __lsx_vsra_b(_1, _2); }
// CHECK-LABEL: @vsra_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsra_h(v8i16 _1, v8i16 _2) { return __lsx_vsra_h(_1, _2); }
// CHECK-LABEL: @vsra_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsra_w(v4i32 _1, v4i32 _2) { return __lsx_vsra_w(_1, _2); }
// CHECK-LABEL: @vsra_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsra_d(v2i64 _1, v2i64 _2) { return __lsx_vsra_d(_1, _2); }
// CHECK-LABEL: @vsrai_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrai_b(v16i8 _1) { return __lsx_vsrai_b(_1, 1); }
// CHECK-LABEL: @vsrai_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrai_h(v8i16 _1) { return __lsx_vsrai_h(_1, 1); }
// CHECK-LABEL: @vsrai_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrai_w(v4i32 _1) { return __lsx_vsrai_w(_1, 1); }
// CHECK-LABEL: @vsrai_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrai_d(v2i64 _1) { return __lsx_vsrai_d(_1, 1); }
// CHECK-LABEL: @vsrar_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrar_b(v16i8 _1, v16i8 _2) { return __lsx_vsrar_b(_1, _2); }
// CHECK-LABEL: @vsrar_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrar_h(v8i16 _1, v8i16 _2) { return __lsx_vsrar_h(_1, _2); }
// CHECK-LABEL: @vsrar_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrar_w(v4i32 _1, v4i32 _2) { return __lsx_vsrar_w(_1, _2); }
// CHECK-LABEL: @vsrar_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrar_d(v2i64 _1, v2i64 _2) { return __lsx_vsrar_d(_1, _2); }
// CHECK-LABEL: @vsrari_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrari_b(v16i8 _1) { return __lsx_vsrari_b(_1, 1); }
// CHECK-LABEL: @vsrari_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrari_h(v8i16 _1) { return __lsx_vsrari_h(_1, 1); }
// CHECK-LABEL: @vsrari_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrari_w(v4i32 _1) { return __lsx_vsrari_w(_1, 1); }
// CHECK-LABEL: @vsrari_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrari_d(v2i64 _1) { return __lsx_vsrari_d(_1, 1); }
// CHECK-LABEL: @vsrl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrl_b(v16i8 _1, v16i8 _2) { return __lsx_vsrl_b(_1, _2); }
// CHECK-LABEL: @vsrl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrl_h(v8i16 _1, v8i16 _2) { return __lsx_vsrl_h(_1, _2); }
// CHECK-LABEL: @vsrl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrl_w(v4i32 _1, v4i32 _2) { return __lsx_vsrl_w(_1, _2); }
// CHECK-LABEL: @vsrl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrl_d(v2i64 _1, v2i64 _2) { return __lsx_vsrl_d(_1, _2); }
// CHECK-LABEL: @vsrli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrli_b(v16i8 _1) { return __lsx_vsrli_b(_1, 1); }
// CHECK-LABEL: @vsrli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrli_h(v8i16 _1) { return __lsx_vsrli_h(_1, 1); }
// CHECK-LABEL: @vsrli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrli_w(v4i32 _1) { return __lsx_vsrli_w(_1, 1); }
// CHECK-LABEL: @vsrli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrli_d(v2i64 _1) { return __lsx_vsrli_d(_1, 1); }
// CHECK-LABEL: @vsrlr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlr_b(v16i8 _1, v16i8 _2) { return __lsx_vsrlr_b(_1, _2); }
// CHECK-LABEL: @vsrlr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlr_h(v8i16 _1, v8i16 _2) { return __lsx_vsrlr_h(_1, _2); }
// CHECK-LABEL: @vsrlr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlr_w(v4i32 _1, v4i32 _2) { return __lsx_vsrlr_w(_1, _2); }
// CHECK-LABEL: @vsrlr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrlr_d(v2i64 _1, v2i64 _2) { return __lsx_vsrlr_d(_1, _2); }
// CHECK-LABEL: @vsrlri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrlri_b(v16i8 _1) { return __lsx_vsrlri_b(_1, 1); }
// CHECK-LABEL: @vsrlri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrlri_h(v8i16 _1) { return __lsx_vsrlri_h(_1, 1); }
// CHECK-LABEL: @vsrlri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrlri_w(v4i32 _1) { return __lsx_vsrlri_w(_1, 1); }
// CHECK-LABEL: @vsrlri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrlri_d(v2i64 _1) { return __lsx_vsrlri_d(_1, 1); }
// CHECK-LABEL: @vbitclr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitclr_b(v16u8 _1, v16u8 _2) { return __lsx_vbitclr_b(_1, _2); }
// CHECK-LABEL: @vbitclr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vbitclr_h(v8u16 _1, v8u16 _2) { return __lsx_vbitclr_h(_1, _2); }
// CHECK-LABEL: @vbitclr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vbitclr_w(v4u32 _1, v4u32 _2) { return __lsx_vbitclr_w(_1, _2); }
// CHECK-LABEL: @vbitclr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vbitclr_d(v2u64 _1, v2u64 _2) { return __lsx_vbitclr_d(_1, _2); }
// CHECK-LABEL: @vbitclri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vbitclri_b(v16u8 _1) { return __lsx_vbitclri_b(_1, 1); }
// CHECK-LABEL: @vbitclri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vbitclri_h(v8u16 _1) { return __lsx_vbitclri_h(_1, 1); }
// CHECK-LABEL: @vbitclri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vbitclri_w(v4u32 _1) { return __lsx_vbitclri_w(_1, 1); }
// CHECK-LABEL: @vbitclri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vbitclri_d(v2u64 _1) { return __lsx_vbitclri_d(_1, 1); }
// CHECK-LABEL: @vbitset_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitset_b(v16u8 _1, v16u8 _2) { return __lsx_vbitset_b(_1, _2); }
// CHECK-LABEL: @vbitset_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vbitset_h(v8u16 _1, v8u16 _2) { return __lsx_vbitset_h(_1, _2); }
// CHECK-LABEL: @vbitset_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vbitset_w(v4u32 _1, v4u32 _2) { return __lsx_vbitset_w(_1, _2); }
// CHECK-LABEL: @vbitset_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vbitset_d(v2u64 _1, v2u64 _2) { return __lsx_vbitset_d(_1, _2); }
// CHECK-LABEL: @vbitseti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vbitseti_b(v16u8 _1) { return __lsx_vbitseti_b(_1, 1); }
// CHECK-LABEL: @vbitseti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vbitseti_h(v8u16 _1) { return __lsx_vbitseti_h(_1, 1); }
// CHECK-LABEL: @vbitseti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vbitseti_w(v4u32 _1) { return __lsx_vbitseti_w(_1, 1); }
// CHECK-LABEL: @vbitseti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vbitseti_d(v2u64 _1) { return __lsx_vbitseti_d(_1, 1); }
// CHECK-LABEL: @vbitrev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitrev_b(v16u8 _1, v16u8 _2) { return __lsx_vbitrev_b(_1, _2); }
// CHECK-LABEL: @vbitrev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vbitrev_h(v8u16 _1, v8u16 _2) { return __lsx_vbitrev_h(_1, _2); }
// CHECK-LABEL: @vbitrev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vbitrev_w(v4u32 _1, v4u32 _2) { return __lsx_vbitrev_w(_1, _2); }
// CHECK-LABEL: @vbitrev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vbitrev_d(v2u64 _1, v2u64 _2) { return __lsx_vbitrev_d(_1, _2); }
// CHECK-LABEL: @vbitrevi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vbitrevi_b(v16u8 _1) { return __lsx_vbitrevi_b(_1, 1); }
// CHECK-LABEL: @vbitrevi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vbitrevi_h(v8u16 _1) { return __lsx_vbitrevi_h(_1, 1); }
// CHECK-LABEL: @vbitrevi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vbitrevi_w(v4u32 _1) { return __lsx_vbitrevi_w(_1, 1); }
// CHECK-LABEL: @vbitrevi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vbitrevi_d(v2u64 _1) { return __lsx_vbitrevi_d(_1, 1); }
// CHECK-LABEL: @vadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vadd_b(v16i8 _1, v16i8 _2) { return __lsx_vadd_b(_1, _2); }
// CHECK-LABEL: @vadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vadd_h(v8i16 _1, v8i16 _2) { return __lsx_vadd_h(_1, _2); }
// CHECK-LABEL: @vadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vadd_w(v4i32 _1, v4i32 _2) { return __lsx_vadd_w(_1, _2); }
// CHECK-LABEL: @vadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vadd_d(v2i64 _1, v2i64 _2) { return __lsx_vadd_d(_1, _2); }
// CHECK-LABEL: @vaddi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vaddi_bu(v16i8 _1) { return __lsx_vaddi_bu(_1, 1); }
// CHECK-LABEL: @vaddi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vaddi_hu(v8i16 _1) { return __lsx_vaddi_hu(_1, 1); }
// CHECK-LABEL: @vaddi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vaddi_wu(v4i32 _1) { return __lsx_vaddi_wu(_1, 1); }
// CHECK-LABEL: @vaddi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vaddi_du(v2i64 _1) { return __lsx_vaddi_du(_1, 1); }
// CHECK-LABEL: @vsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsub_b(v16i8 _1, v16i8 _2) { return __lsx_vsub_b(_1, _2); }
// CHECK-LABEL: @vsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsub_h(v8i16 _1, v8i16 _2) { return __lsx_vsub_h(_1, _2); }
// CHECK-LABEL: @vsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsub_w(v4i32 _1, v4i32 _2) { return __lsx_vsub_w(_1, _2); }
// CHECK-LABEL: @vsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsub_d(v2i64 _1, v2i64 _2) { return __lsx_vsub_d(_1, _2); }
// CHECK-LABEL: @vsubi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsubi_bu(v16i8 _1) { return __lsx_vsubi_bu(_1, 1); }
// CHECK-LABEL: @vsubi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsubi_hu(v8i16 _1) { return __lsx_vsubi_hu(_1, 1); }
// CHECK-LABEL: @vsubi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsubi_wu(v4i32 _1) { return __lsx_vsubi_wu(_1, 1); }
// CHECK-LABEL: @vsubi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsubi_du(v2i64 _1) { return __lsx_vsubi_du(_1, 1); }
// CHECK-LABEL: @vmax_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmax_b(v16i8 _1, v16i8 _2) { return __lsx_vmax_b(_1, _2); }
// CHECK-LABEL: @vmax_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmax_h(v8i16 _1, v8i16 _2) { return __lsx_vmax_h(_1, _2); }
// CHECK-LABEL: @vmax_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmax_w(v4i32 _1, v4i32 _2) { return __lsx_vmax_w(_1, _2); }
// CHECK-LABEL: @vmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmax_d(v2i64 _1, v2i64 _2) { return __lsx_vmax_d(_1, _2); }
// CHECK-LABEL: @vmaxi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmaxi_b(v16i8 _1) { return __lsx_vmaxi_b(_1, 1); }
// CHECK-LABEL: @vmaxi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vmaxi_h(v8i16 _1) { return __lsx_vmaxi_h(_1, 1); }
// CHECK-LABEL: @vmaxi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vmaxi_w(v4i32 _1) { return __lsx_vmaxi_w(_1, 1); }
// CHECK-LABEL: @vmaxi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vmaxi_d(v2i64 _1) { return __lsx_vmaxi_d(_1, 1); }
// CHECK-LABEL: @vmax_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmax_bu(v16u8 _1, v16u8 _2) { return __lsx_vmax_bu(_1, _2); }
// CHECK-LABEL: @vmax_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmax_hu(v8u16 _1, v8u16 _2) { return __lsx_vmax_hu(_1, _2); }
// CHECK-LABEL: @vmax_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmax_wu(v4u32 _1, v4u32 _2) { return __lsx_vmax_wu(_1, _2); }
// CHECK-LABEL: @vmax_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmax_du(v2u64 _1, v2u64 _2) { return __lsx_vmax_du(_1, _2); }
// CHECK-LABEL: @vmaxi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vmaxi_bu(v16u8 _1) { return __lsx_vmaxi_bu(_1, 1); }
// CHECK-LABEL: @vmaxi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vmaxi_hu(v8u16 _1) { return __lsx_vmaxi_hu(_1, 1); }
// CHECK-LABEL: @vmaxi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vmaxi_wu(v4u32 _1) { return __lsx_vmaxi_wu(_1, 1); }
// CHECK-LABEL: @vmaxi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vmaxi_du(v2u64 _1) { return __lsx_vmaxi_du(_1, 1); }
// CHECK-LABEL: @vmin_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmin_b(v16i8 _1, v16i8 _2) { return __lsx_vmin_b(_1, _2); }
// CHECK-LABEL: @vmin_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmin_h(v8i16 _1, v8i16 _2) { return __lsx_vmin_h(_1, _2); }
// CHECK-LABEL: @vmin_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmin_w(v4i32 _1, v4i32 _2) { return __lsx_vmin_w(_1, _2); }
// CHECK-LABEL: @vmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmin_d(v2i64 _1, v2i64 _2) { return __lsx_vmin_d(_1, _2); }
// CHECK-LABEL: @vmini_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmini_b(v16i8 _1) { return __lsx_vmini_b(_1, 1); }
// CHECK-LABEL: @vmini_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vmini_h(v8i16 _1) { return __lsx_vmini_h(_1, 1); }
// CHECK-LABEL: @vmini_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vmini_w(v4i32 _1) { return __lsx_vmini_w(_1, 1); }
// CHECK-LABEL: @vmini_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vmini_d(v2i64 _1) { return __lsx_vmini_d(_1, 1); }
// CHECK-LABEL: @vmin_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmin_bu(v16u8 _1, v16u8 _2) { return __lsx_vmin_bu(_1, _2); }
// CHECK-LABEL: @vmin_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmin_hu(v8u16 _1, v8u16 _2) { return __lsx_vmin_hu(_1, _2); }
// CHECK-LABEL: @vmin_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmin_wu(v4u32 _1, v4u32 _2) { return __lsx_vmin_wu(_1, _2); }
// CHECK-LABEL: @vmin_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmin_du(v2u64 _1, v2u64 _2) { return __lsx_vmin_du(_1, _2); }
// CHECK-LABEL: @vmini_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vmini_bu(v16u8 _1) { return __lsx_vmini_bu(_1, 1); }
// CHECK-LABEL: @vmini_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vmini_hu(v8u16 _1) { return __lsx_vmini_hu(_1, 1); }
// CHECK-LABEL: @vmini_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vmini_wu(v4u32 _1) { return __lsx_vmini_wu(_1, 1); }
// CHECK-LABEL: @vmini_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vmini_du(v2u64 _1) { return __lsx_vmini_du(_1, 1); }
// CHECK-LABEL: @vseq_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vseq_b(v16i8 _1, v16i8 _2) { return __lsx_vseq_b(_1, _2); }
// CHECK-LABEL: @vseq_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vseq_h(v8i16 _1, v8i16 _2) { return __lsx_vseq_h(_1, _2); }
// CHECK-LABEL: @vseq_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vseq_w(v4i32 _1, v4i32 _2) { return __lsx_vseq_w(_1, _2); }
// CHECK-LABEL: @vseq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vseq_d(v2i64 _1, v2i64 _2) { return __lsx_vseq_d(_1, _2); }
// CHECK-LABEL: @vseqi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vseqi_b(v16i8 _1) { return __lsx_vseqi_b(_1, 1); }
// CHECK-LABEL: @vseqi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vseqi_h(v8i16 _1) { return __lsx_vseqi_h(_1, 1); }
// CHECK-LABEL: @vseqi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vseqi_w(v4i32 _1) { return __lsx_vseqi_w(_1, 1); }
// CHECK-LABEL: @vseqi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vseqi_d(v2i64 _1) { return __lsx_vseqi_d(_1, 1); }
// CHECK-LABEL: @vslti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslti_b(v16i8 _1) { return __lsx_vslti_b(_1, 1); }
// CHECK-LABEL: @vslt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vslt_b(v16i8 _1, v16i8 _2) { return __lsx_vslt_b(_1, _2); }
// CHECK-LABEL: @vslt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vslt_h(v8i16 _1, v8i16 _2) { return __lsx_vslt_h(_1, _2); }
// CHECK-LABEL: @vslt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vslt_w(v4i32 _1, v4i32 _2) { return __lsx_vslt_w(_1, _2); }
// CHECK-LABEL: @vslt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vslt_d(v2i64 _1, v2i64 _2) { return __lsx_vslt_d(_1, _2); }
// CHECK-LABEL: @vslti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslti_h(v8i16 _1) { return __lsx_vslti_h(_1, 1); }
// CHECK-LABEL: @vslti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslti_w(v4i32 _1) { return __lsx_vslti_w(_1, 1); }
// CHECK-LABEL: @vslti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslti_d(v2i64 _1) { return __lsx_vslti_d(_1, 1); }
// CHECK-LABEL: @vslt_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vslt_bu(v16u8 _1, v16u8 _2) { return __lsx_vslt_bu(_1, _2); }
// CHECK-LABEL: @vslt_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vslt_hu(v8u16 _1, v8u16 _2) { return __lsx_vslt_hu(_1, _2); }
// CHECK-LABEL: @vslt_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vslt_wu(v4u32 _1, v4u32 _2) { return __lsx_vslt_wu(_1, _2); }
// CHECK-LABEL: @vslt_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vslt_du(v2u64 _1, v2u64 _2) { return __lsx_vslt_du(_1, _2); }
// CHECK-LABEL: @vslti_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslti_bu(v16u8 _1) { return __lsx_vslti_bu(_1, 1); }
// CHECK-LABEL: @vslti_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslti_hu(v8u16 _1) { return __lsx_vslti_hu(_1, 1); }
// CHECK-LABEL: @vslti_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslti_wu(v4u32 _1) { return __lsx_vslti_wu(_1, 1); }
// CHECK-LABEL: @vslti_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslti_du(v2u64 _1) { return __lsx_vslti_du(_1, 1); }
// CHECK-LABEL: @vsle_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsle_b(v16i8 _1, v16i8 _2) { return __lsx_vsle_b(_1, _2); }
// CHECK-LABEL: @vsle_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsle_h(v8i16 _1, v8i16 _2) { return __lsx_vsle_h(_1, _2); }
// CHECK-LABEL: @vsle_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsle_w(v4i32 _1, v4i32 _2) { return __lsx_vsle_w(_1, _2); }
// CHECK-LABEL: @vsle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsle_d(v2i64 _1, v2i64 _2) { return __lsx_vsle_d(_1, _2); }
// CHECK-LABEL: @vslei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslei_b(v16i8 _1) { return __lsx_vslei_b(_1, 1); }
// CHECK-LABEL: @vslei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslei_h(v8i16 _1) { return __lsx_vslei_h(_1, 1); }
// CHECK-LABEL: @vslei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslei_w(v4i32 _1) { return __lsx_vslei_w(_1, 1); }
// CHECK-LABEL: @vslei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslei_d(v2i64 _1) { return __lsx_vslei_d(_1, 1); }
// CHECK-LABEL: @vsle_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsle_bu(v16u8 _1, v16u8 _2) { return __lsx_vsle_bu(_1, _2); }
// CHECK-LABEL: @vsle_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsle_hu(v8u16 _1, v8u16 _2) { return __lsx_vsle_hu(_1, _2); }
// CHECK-LABEL: @vsle_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsle_wu(v4u32 _1, v4u32 _2) { return __lsx_vsle_wu(_1, _2); }
// CHECK-LABEL: @vsle_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsle_du(v2u64 _1, v2u64 _2) { return __lsx_vsle_du(_1, _2); }
// CHECK-LABEL: @vslei_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslei_bu(v16u8 _1) { return __lsx_vslei_bu(_1, 1); }
// CHECK-LABEL: @vslei_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslei_hu(v8u16 _1) { return __lsx_vslei_hu(_1, 1); }
// CHECK-LABEL: @vslei_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslei_wu(v4u32 _1) { return __lsx_vslei_wu(_1, 1); }
// CHECK-LABEL: @vslei_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslei_du(v2u64 _1) { return __lsx_vslei_du(_1, 1); }
// CHECK-LABEL: @vsat_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsat_b(v16i8 _1) { return __lsx_vsat_b(_1, 1); }
// CHECK-LABEL: @vsat_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsat_h(v8i16 _1) { return __lsx_vsat_h(_1, 1); }
// CHECK-LABEL: @vsat_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsat_w(v4i32 _1) { return __lsx_vsat_w(_1, 1); }
// CHECK-LABEL: @vsat_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsat_d(v2i64 _1) { return __lsx_vsat_d(_1, 1); }
// CHECK-LABEL: @vsat_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vsat_bu(v16u8 _1) { return __lsx_vsat_bu(_1, 1); }
// CHECK-LABEL: @vsat_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vsat_hu(v8u16 _1) { return __lsx_vsat_hu(_1, 1); }
// CHECK-LABEL: @vsat_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vsat_wu(v4u32 _1) { return __lsx_vsat_wu(_1, 1); }
// CHECK-LABEL: @vsat_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vsat_du(v2u64 _1) { return __lsx_vsat_du(_1, 1); }
// CHECK-LABEL: @vadda_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vadda_b(v16i8 _1, v16i8 _2) { return __lsx_vadda_b(_1, _2); }
// CHECK-LABEL: @vadda_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vadda_h(v8i16 _1, v8i16 _2) { return __lsx_vadda_h(_1, _2); }
// CHECK-LABEL: @vadda_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vadda_w(v4i32 _1, v4i32 _2) { return __lsx_vadda_w(_1, _2); }
// CHECK-LABEL: @vadda_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vadda_d(v2i64 _1, v2i64 _2) { return __lsx_vadda_d(_1, _2); }
// CHECK-LABEL: @vsadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsadd_b(v16i8 _1, v16i8 _2) { return __lsx_vsadd_b(_1, _2); }
// CHECK-LABEL: @vsadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsadd_h(v8i16 _1, v8i16 _2) { return __lsx_vsadd_h(_1, _2); }
// CHECK-LABEL: @vsadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsadd_w(v4i32 _1, v4i32 _2) { return __lsx_vsadd_w(_1, _2); }
// CHECK-LABEL: @vsadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsadd_d(v2i64 _1, v2i64 _2) { return __lsx_vsadd_d(_1, _2); }
// CHECK-LABEL: @vsadd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vsadd_bu(v16u8 _1, v16u8 _2) { return __lsx_vsadd_bu(_1, _2); }
// CHECK-LABEL: @vsadd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vsadd_hu(v8u16 _1, v8u16 _2) { return __lsx_vsadd_hu(_1, _2); }
// CHECK-LABEL: @vsadd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vsadd_wu(v4u32 _1, v4u32 _2) { return __lsx_vsadd_wu(_1, _2); }
// CHECK-LABEL: @vsadd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vsadd_du(v2u64 _1, v2u64 _2) { return __lsx_vsadd_du(_1, _2); }
// CHECK-LABEL: @vavg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vavg_b(v16i8 _1, v16i8 _2) { return __lsx_vavg_b(_1, _2); }
// CHECK-LABEL: @vavg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vavg_h(v8i16 _1, v8i16 _2) { return __lsx_vavg_h(_1, _2); }
// CHECK-LABEL: @vavg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vavg_w(v4i32 _1, v4i32 _2) { return __lsx_vavg_w(_1, _2); }
// CHECK-LABEL: @vavg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vavg_d(v2i64 _1, v2i64 _2) { return __lsx_vavg_d(_1, _2); }
// CHECK-LABEL: @vavg_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vavg_bu(v16u8 _1, v16u8 _2) { return __lsx_vavg_bu(_1, _2); }
// CHECK-LABEL: @vavg_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vavg_hu(v8u16 _1, v8u16 _2) { return __lsx_vavg_hu(_1, _2); }
// CHECK-LABEL: @vavg_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vavg_wu(v4u32 _1, v4u32 _2) { return __lsx_vavg_wu(_1, _2); }
// CHECK-LABEL: @vavg_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vavg_du(v2u64 _1, v2u64 _2) { return __lsx_vavg_du(_1, _2); }
// CHECK-LABEL: @vavgr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vavgr_b(v16i8 _1, v16i8 _2) { return __lsx_vavgr_b(_1, _2); }
// CHECK-LABEL: @vavgr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vavgr_h(v8i16 _1, v8i16 _2) { return __lsx_vavgr_h(_1, _2); }
// CHECK-LABEL: @vavgr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vavgr_w(v4i32 _1, v4i32 _2) { return __lsx_vavgr_w(_1, _2); }
// CHECK-LABEL: @vavgr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vavgr_d(v2i64 _1, v2i64 _2) { return __lsx_vavgr_d(_1, _2); }
// CHECK-LABEL: @vavgr_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vavgr_bu(v16u8 _1, v16u8 _2) { return __lsx_vavgr_bu(_1, _2); }
// CHECK-LABEL: @vavgr_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vavgr_hu(v8u16 _1, v8u16 _2) { return __lsx_vavgr_hu(_1, _2); }
// CHECK-LABEL: @vavgr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vavgr_wu(v4u32 _1, v4u32 _2) { return __lsx_vavgr_wu(_1, _2); }
// CHECK-LABEL: @vavgr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vavgr_du(v2u64 _1, v2u64 _2) { return __lsx_vavgr_du(_1, _2); }
// CHECK-LABEL: @vssub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssub_b(v16i8 _1, v16i8 _2) { return __lsx_vssub_b(_1, _2); }
// CHECK-LABEL: @vssub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssub_h(v8i16 _1, v8i16 _2) { return __lsx_vssub_h(_1, _2); }
// CHECK-LABEL: @vssub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssub_w(v4i32 _1, v4i32 _2) { return __lsx_vssub_w(_1, _2); }
// CHECK-LABEL: @vssub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssub_d(v2i64 _1, v2i64 _2) { return __lsx_vssub_d(_1, _2); }
// CHECK-LABEL: @vssub_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssub_bu(v16u8 _1, v16u8 _2) { return __lsx_vssub_bu(_1, _2); }
// CHECK-LABEL: @vssub_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssub_hu(v8u16 _1, v8u16 _2) { return __lsx_vssub_hu(_1, _2); }
// CHECK-LABEL: @vssub_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssub_wu(v4u32 _1, v4u32 _2) { return __lsx_vssub_wu(_1, _2); }
// CHECK-LABEL: @vssub_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssub_du(v2u64 _1, v2u64 _2) { return __lsx_vssub_du(_1, _2); }
// CHECK-LABEL: @vabsd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vabsd_b(v16i8 _1, v16i8 _2) { return __lsx_vabsd_b(_1, _2); }
// CHECK-LABEL: @vabsd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vabsd_h(v8i16 _1, v8i16 _2) { return __lsx_vabsd_h(_1, _2); }
// CHECK-LABEL: @vabsd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vabsd_w(v4i32 _1, v4i32 _2) { return __lsx_vabsd_w(_1, _2); }
// CHECK-LABEL: @vabsd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vabsd_d(v2i64 _1, v2i64 _2) { return __lsx_vabsd_d(_1, _2); }
// CHECK-LABEL: @vabsd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vabsd_bu(v16u8 _1, v16u8 _2) { return __lsx_vabsd_bu(_1, _2); }
// CHECK-LABEL: @vabsd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vabsd_hu(v8u16 _1, v8u16 _2) { return __lsx_vabsd_hu(_1, _2); }
// CHECK-LABEL: @vabsd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vabsd_wu(v4u32 _1, v4u32 _2) { return __lsx_vabsd_wu(_1, _2); }
// CHECK-LABEL: @vabsd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vabsd_du(v2u64 _1, v2u64 _2) { return __lsx_vabsd_du(_1, _2); }
// CHECK-LABEL: @vmul_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmul_b(v16i8 _1, v16i8 _2) { return __lsx_vmul_b(_1, _2); }
// CHECK-LABEL: @vmul_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmul_h(v8i16 _1, v8i16 _2) { return __lsx_vmul_h(_1, _2); }
// CHECK-LABEL: @vmul_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmul_w(v4i32 _1, v4i32 _2) { return __lsx_vmul_w(_1, _2); }
// CHECK-LABEL: @vmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmul_d(v2i64 _1, v2i64 _2) { return __lsx_vmul_d(_1, _2); }
// CHECK-LABEL: @vmadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vmadd_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __lsx_vmadd_b(_1, _2, _3);
}
// CHECK-LABEL: @vmadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmadd_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __lsx_vmadd_h(_1, _2, _3);
}
// CHECK-LABEL: @vmadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmadd_w(v4i32 _1, v4i32 _2, v4i32 _3) {
return __lsx_vmadd_w(_1, _2, _3);
}
// CHECK-LABEL: @vmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmadd_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __lsx_vmadd_d(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vmsub_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __lsx_vmsub_b(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmsub_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __lsx_vmsub_h(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmsub_w(v4i32 _1, v4i32 _2, v4i32 _3) {
return __lsx_vmsub_w(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmsub_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __lsx_vmsub_d(_1, _2, _3);
}
// CHECK-LABEL: @vdiv_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vdiv_b(v16i8 _1, v16i8 _2) { return __lsx_vdiv_b(_1, _2); }
// CHECK-LABEL: @vdiv_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vdiv_h(v8i16 _1, v8i16 _2) { return __lsx_vdiv_h(_1, _2); }
// CHECK-LABEL: @vdiv_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vdiv_w(v4i32 _1, v4i32 _2) { return __lsx_vdiv_w(_1, _2); }
// CHECK-LABEL: @vdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vdiv_d(v2i64 _1, v2i64 _2) { return __lsx_vdiv_d(_1, _2); }
// CHECK-LABEL: @vdiv_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vdiv_bu(v16u8 _1, v16u8 _2) { return __lsx_vdiv_bu(_1, _2); }
// CHECK-LABEL: @vdiv_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vdiv_hu(v8u16 _1, v8u16 _2) { return __lsx_vdiv_hu(_1, _2); }
// CHECK-LABEL: @vdiv_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vdiv_wu(v4u32 _1, v4u32 _2) { return __lsx_vdiv_wu(_1, _2); }
// CHECK-LABEL: @vdiv_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vdiv_du(v2u64 _1, v2u64 _2) { return __lsx_vdiv_du(_1, _2); }
// CHECK-LABEL: @vhaddw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vhaddw_h_b(v16i8 _1, v16i8 _2) { return __lsx_vhaddw_h_b(_1, _2); }
// CHECK-LABEL: @vhaddw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vhaddw_w_h(v8i16 _1, v8i16 _2) { return __lsx_vhaddw_w_h(_1, _2); }
// CHECK-LABEL: @vhaddw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhaddw_d_w(v4i32 _1, v4i32 _2) { return __lsx_vhaddw_d_w(_1, _2); }
// CHECK-LABEL: @vhaddw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vhaddw_hu_bu(v16u8 _1, v16u8 _2) { return __lsx_vhaddw_hu_bu(_1, _2); }
// CHECK-LABEL: @vhaddw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vhaddw_wu_hu(v8u16 _1, v8u16 _2) { return __lsx_vhaddw_wu_hu(_1, _2); }
// CHECK-LABEL: @vhaddw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vhaddw_du_wu(v4u32 _1, v4u32 _2) { return __lsx_vhaddw_du_wu(_1, _2); }
// CHECK-LABEL: @vhsubw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vhsubw_h_b(v16i8 _1, v16i8 _2) { return __lsx_vhsubw_h_b(_1, _2); }
// CHECK-LABEL: @vhsubw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vhsubw_w_h(v8i16 _1, v8i16 _2) { return __lsx_vhsubw_w_h(_1, _2); }
// CHECK-LABEL: @vhsubw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhsubw_d_w(v4i32 _1, v4i32 _2) { return __lsx_vhsubw_d_w(_1, _2); }
// CHECK-LABEL: @vhsubw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vhsubw_hu_bu(v16u8 _1, v16u8 _2) { return __lsx_vhsubw_hu_bu(_1, _2); }
// CHECK-LABEL: @vhsubw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vhsubw_wu_hu(v8u16 _1, v8u16 _2) { return __lsx_vhsubw_wu_hu(_1, _2); }
// CHECK-LABEL: @vhsubw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhsubw_du_wu(v4u32 _1, v4u32 _2) { return __lsx_vhsubw_du_wu(_1, _2); }
// CHECK-LABEL: @vmod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmod_b(v16i8 _1, v16i8 _2) { return __lsx_vmod_b(_1, _2); }
// CHECK-LABEL: @vmod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmod_h(v8i16 _1, v8i16 _2) { return __lsx_vmod_h(_1, _2); }
// CHECK-LABEL: @vmod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmod_w(v4i32 _1, v4i32 _2) { return __lsx_vmod_w(_1, _2); }
// CHECK-LABEL: @vmod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmod_d(v2i64 _1, v2i64 _2) { return __lsx_vmod_d(_1, _2); }
// CHECK-LABEL: @vmod_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmod_bu(v16u8 _1, v16u8 _2) { return __lsx_vmod_bu(_1, _2); }
// CHECK-LABEL: @vmod_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmod_hu(v8u16 _1, v8u16 _2) { return __lsx_vmod_hu(_1, _2); }
// CHECK-LABEL: @vmod_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmod_wu(v4u32 _1, v4u32 _2) { return __lsx_vmod_wu(_1, _2); }
// CHECK-LABEL: @vmod_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmod_du(v2u64 _1, v2u64 _2) { return __lsx_vmod_du(_1, _2); }
// CHECK-LABEL: @vreplve_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vreplve_b(v16i8 _1, int _2) { return __lsx_vreplve_b(_1, _2); }
// CHECK-LABEL: @vreplve_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vreplve_h(v8i16 _1, int _2) { return __lsx_vreplve_h(_1, _2); }
// CHECK-LABEL: @vreplve_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vreplve_w(v4i32 _1, int _2) { return __lsx_vreplve_w(_1, _2); }
// CHECK-LABEL: @vreplve_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vreplve_d(v2i64 _1, int _2) { return __lsx_vreplve_d(_1, _2); }
// CHECK-LABEL: @vreplvei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vreplvei_b(v16i8 _1) { return __lsx_vreplvei_b(_1, 1); }
// CHECK-LABEL: @vreplvei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vreplvei_h(v8i16 _1) { return __lsx_vreplvei_h(_1, 1); }
// CHECK-LABEL: @vreplvei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vreplvei_w(v4i32 _1) { return __lsx_vreplvei_w(_1, 1); }
// CHECK-LABEL: @vreplvei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vreplvei_d(v2i64 _1) { return __lsx_vreplvei_d(_1, 1); }
// CHECK-LABEL: @vpickev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpickev_b(v16i8 _1, v16i8 _2) { return __lsx_vpickev_b(_1, _2); }
// CHECK-LABEL: @vpickev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpickev_h(v8i16 _1, v8i16 _2) { return __lsx_vpickev_h(_1, _2); }
// CHECK-LABEL: @vpickev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpickev_w(v4i32 _1, v4i32 _2) { return __lsx_vpickev_w(_1, _2); }
// CHECK-LABEL: @vpickev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpickev_d(v2i64 _1, v2i64 _2) { return __lsx_vpickev_d(_1, _2); }
// CHECK-LABEL: @vpickod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpickod_b(v16i8 _1, v16i8 _2) { return __lsx_vpickod_b(_1, _2); }
// CHECK-LABEL: @vpickod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpickod_h(v8i16 _1, v8i16 _2) { return __lsx_vpickod_h(_1, _2); }
// CHECK-LABEL: @vpickod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpickod_w(v4i32 _1, v4i32 _2) { return __lsx_vpickod_w(_1, _2); }
// CHECK-LABEL: @vpickod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpickod_d(v2i64 _1, v2i64 _2) { return __lsx_vpickod_d(_1, _2); }
// CHECK-LABEL: @vilvh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vilvh_b(v16i8 _1, v16i8 _2) { return __lsx_vilvh_b(_1, _2); }
// CHECK-LABEL: @vilvh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vilvh_h(v8i16 _1, v8i16 _2) { return __lsx_vilvh_h(_1, _2); }
// CHECK-LABEL: @vilvh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vilvh_w(v4i32 _1, v4i32 _2) { return __lsx_vilvh_w(_1, _2); }
// CHECK-LABEL: @vilvh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vilvh_d(v2i64 _1, v2i64 _2) { return __lsx_vilvh_d(_1, _2); }
// CHECK-LABEL: @vilvl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vilvl_b(v16i8 _1, v16i8 _2) { return __lsx_vilvl_b(_1, _2); }
// CHECK-LABEL: @vilvl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vilvl_h(v8i16 _1, v8i16 _2) { return __lsx_vilvl_h(_1, _2); }
// CHECK-LABEL: @vilvl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vilvl_w(v4i32 _1, v4i32 _2) { return __lsx_vilvl_w(_1, _2); }
// CHECK-LABEL: @vilvl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vilvl_d(v2i64 _1, v2i64 _2) { return __lsx_vilvl_d(_1, _2); }
// CHECK-LABEL: @vpackev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpackev_b(v16i8 _1, v16i8 _2) { return __lsx_vpackev_b(_1, _2); }
// CHECK-LABEL: @vpackev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpackev_h(v8i16 _1, v8i16 _2) { return __lsx_vpackev_h(_1, _2); }
// CHECK-LABEL: @vpackev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpackev_w(v4i32 _1, v4i32 _2) { return __lsx_vpackev_w(_1, _2); }
// CHECK-LABEL: @vpackev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpackev_d(v2i64 _1, v2i64 _2) { return __lsx_vpackev_d(_1, _2); }
// CHECK-LABEL: @vpackod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpackod_b(v16i8 _1, v16i8 _2) { return __lsx_vpackod_b(_1, _2); }
// CHECK-LABEL: @vpackod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpackod_h(v8i16 _1, v8i16 _2) { return __lsx_vpackod_h(_1, _2); }
// CHECK-LABEL: @vpackod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpackod_w(v4i32 _1, v4i32 _2) { return __lsx_vpackod_w(_1, _2); }
// CHECK-LABEL: @vpackod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpackod_d(v2i64 _1, v2i64 _2) { return __lsx_vpackod_d(_1, _2); }
// CHECK-LABEL: @vshuf_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vshuf_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __lsx_vshuf_h(_1, _2, _3);
}
// CHECK-LABEL: @vshuf_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vshuf_w(v4i32 _1, v4i32 _2, v4i32 _3) {
return __lsx_vshuf_w(_1, _2, _3);
}
// CHECK-LABEL: @vshuf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vshuf_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __lsx_vshuf_d(_1, _2, _3);
}
// CHECK-LABEL: @vand_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vand_v(v16u8 _1, v16u8 _2) { return __lsx_vand_v(_1, _2); }
// CHECK-LABEL: @vandi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vandi_b(v16u8 _1) { return __lsx_vandi_b(_1, 1); }
// CHECK-LABEL: @vor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vor_v(v16u8 _1, v16u8 _2) { return __lsx_vor_v(_1, _2); }
// CHECK-LABEL: @vori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vori_b(v16u8 _1) { return __lsx_vori_b(_1, 1); }
// CHECK-LABEL: @vnor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vnor_v(v16u8 _1, v16u8 _2) { return __lsx_vnor_v(_1, _2); }
// CHECK-LABEL: @vnori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vnori_b(v16u8 _1) { return __lsx_vnori_b(_1, 1); }
// CHECK-LABEL: @vxor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vxor_v(v16u8 _1, v16u8 _2) { return __lsx_vxor_v(_1, _2); }
// CHECK-LABEL: @vxori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vxori_b(v16u8 _1) { return __lsx_vxori_b(_1, 1); }
// CHECK-LABEL: @vbitsel_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16u8 vbitsel_v(v16u8 _1, v16u8 _2, v16u8 _3) {
return __lsx_vbitsel_v(_1, _2, _3);
}
// CHECK-LABEL: @vbitseli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitseli_b(v16u8 _1, v16u8 _2) { return __lsx_vbitseli_b(_1, _2, 1); }
// CHECK-LABEL: @vshuf4i_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vshuf4i_b(v16i8 _1) { return __lsx_vshuf4i_b(_1, 1); }
// CHECK-LABEL: @vshuf4i_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vshuf4i_h(v8i16 _1) { return __lsx_vshuf4i_h(_1, 1); }
// CHECK-LABEL: @vshuf4i_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vshuf4i_w(v4i32 _1) { return __lsx_vshuf4i_w(_1, 1); }
// CHECK-LABEL: @vreplgr2vr_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vreplgr2vr_b(int _1) { return __lsx_vreplgr2vr_b(_1); }
// CHECK-LABEL: @vreplgr2vr_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v8i16 vreplgr2vr_h(int _1) { return __lsx_vreplgr2vr_h(_1); }
// CHECK-LABEL: @vreplgr2vr_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v4i32 vreplgr2vr_w(int _1) { return __lsx_vreplgr2vr_w(_1); }
// CHECK-LABEL: @vreplgr2vr_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vreplgr2vr_d(long _1) { return __lsx_vreplgr2vr_d(_1); }
// CHECK-LABEL: @vpcnt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vpcnt_b(v16i8 _1) { return __lsx_vpcnt_b(_1); }
// CHECK-LABEL: @vpcnt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vpcnt_h(v8i16 _1) { return __lsx_vpcnt_h(_1); }
// CHECK-LABEL: @vpcnt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vpcnt_w(v4i32 _1) { return __lsx_vpcnt_w(_1); }
// CHECK-LABEL: @vpcnt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vpcnt_d(v2i64 _1) { return __lsx_vpcnt_d(_1); }
// CHECK-LABEL: @vclo_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vclo_b(v16i8 _1) { return __lsx_vclo_b(_1); }
// CHECK-LABEL: @vclo_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vclo_h(v8i16 _1) { return __lsx_vclo_h(_1); }
// CHECK-LABEL: @vclo_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vclo_w(v4i32 _1) { return __lsx_vclo_w(_1); }
// CHECK-LABEL: @vclo_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vclo_d(v2i64 _1) { return __lsx_vclo_d(_1); }
// CHECK-LABEL: @vclz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vclz_b(v16i8 _1) { return __lsx_vclz_b(_1); }
// CHECK-LABEL: @vclz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vclz_h(v8i16 _1) { return __lsx_vclz_h(_1); }
// CHECK-LABEL: @vclz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vclz_w(v4i32 _1) { return __lsx_vclz_w(_1); }
// CHECK-LABEL: @vclz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vclz_d(v2i64 _1) { return __lsx_vclz_d(_1); }
// CHECK-LABEL: @vpickve2gr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int vpickve2gr_b(v16i8 _1) { return __lsx_vpickve2gr_b(_1, 1); }
// CHECK-LABEL: @vpickve2gr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int vpickve2gr_h(v8i16 _1) { return __lsx_vpickve2gr_h(_1, 1); }
// CHECK-LABEL: @vpickve2gr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int vpickve2gr_w(v4i32 _1) { return __lsx_vpickve2gr_w(_1, 1); }
// CHECK-LABEL: @vpickve2gr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
long vpickve2gr_d(v2i64 _1) { return __lsx_vpickve2gr_d(_1, 1); }
// CHECK-LABEL: @vpickve2gr_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int vpickve2gr_bu(v16i8 _1) { return __lsx_vpickve2gr_bu(_1, 1); }
// CHECK-LABEL: @vpickve2gr_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int vpickve2gr_hu(v8i16 _1) { return __lsx_vpickve2gr_hu(_1, 1); }
// CHECK-LABEL: @vpickve2gr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int vpickve2gr_wu(v4i32 _1) { return __lsx_vpickve2gr_wu(_1, 1); }
// CHECK-LABEL: @vpickve2gr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
unsigned long int vpickve2gr_du(v2i64 _1) { return __lsx_vpickve2gr_du(_1, 1); }
// CHECK-LABEL: @vinsgr2vr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[TMP0]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vinsgr2vr_b(v16i8 _1) { return __lsx_vinsgr2vr_b(_1, 1, 1); }
// CHECK-LABEL: @vinsgr2vr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[TMP0]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vinsgr2vr_h(v8i16 _1) { return __lsx_vinsgr2vr_h(_1, 1, 1); }
// CHECK-LABEL: @vinsgr2vr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[TMP0]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vinsgr2vr_w(v4i32 _1) { return __lsx_vinsgr2vr_w(_1, 1, 1); }
// CHECK-LABEL: @vinsgr2vr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[_1:%.*]], i64 1, i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[TMP0]], i64 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vinsgr2vr_d(v2i64 _1) { return __lsx_vinsgr2vr_d(_1, 1, 1); }
// CHECK-LABEL: @vfadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfadd_s(v4f32 _1, v4f32 _2) { return __lsx_vfadd_s(_1, _2); }
// CHECK-LABEL: @vfadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfadd_d(v2f64 _1, v2f64 _2) { return __lsx_vfadd_d(_1, _2); }
// CHECK-LABEL: @vfsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfsub_s(v4f32 _1, v4f32 _2) { return __lsx_vfsub_s(_1, _2); }
// CHECK-LABEL: @vfsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfsub_d(v2f64 _1, v2f64 _2) { return __lsx_vfsub_d(_1, _2); }
// CHECK-LABEL: @vfmul_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmul_s(v4f32 _1, v4f32 _2) { return __lsx_vfmul_s(_1, _2); }
// CHECK-LABEL: @vfmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmul_d(v2f64 _1, v2f64 _2) { return __lsx_vfmul_d(_1, _2); }
// CHECK-LABEL: @vfdiv_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfdiv_s(v4f32 _1, v4f32 _2) { return __lsx_vfdiv_s(_1, _2); }
// CHECK-LABEL: @vfdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfdiv_d(v2f64 _1, v2f64 _2) { return __lsx_vfdiv_d(_1, _2); }
// CHECK-LABEL: @vfcvt_h_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vfcvt_h_s(v4f32 _1, v4f32 _2) { return __lsx_vfcvt_h_s(_1, _2); }
// CHECK-LABEL: @vfcvt_s_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfcvt_s_d(v2f64 _1, v2f64 _2) { return __lsx_vfcvt_s_d(_1, _2); }
// CHECK-LABEL: @vfmin_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmin_s(v4f32 _1, v4f32 _2) { return __lsx_vfmin_s(_1, _2); }
// CHECK-LABEL: @vfmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmin_d(v2f64 _1, v2f64 _2) { return __lsx_vfmin_d(_1, _2); }
// CHECK-LABEL: @vfmina_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmina_s(v4f32 _1, v4f32 _2) { return __lsx_vfmina_s(_1, _2); }
// CHECK-LABEL: @vfmina_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmina_d(v2f64 _1, v2f64 _2) { return __lsx_vfmina_d(_1, _2); }
// CHECK-LABEL: @vfmax_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmax_s(v4f32 _1, v4f32 _2) { return __lsx_vfmax_s(_1, _2); }
// CHECK-LABEL: @vfmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmax_d(v2f64 _1, v2f64 _2) { return __lsx_vfmax_d(_1, _2); }
// CHECK-LABEL: @vfmaxa_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmaxa_s(v4f32 _1, v4f32 _2) { return __lsx_vfmaxa_s(_1, _2); }
// CHECK-LABEL: @vfmaxa_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmaxa_d(v2f64 _1, v2f64 _2) { return __lsx_vfmaxa_d(_1, _2); }
// CHECK-LABEL: @vfclass_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfclass_s(v4f32 _1) { return __lsx_vfclass_s(_1); }
// CHECK-LABEL: @vfclass_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfclass_d(v2f64 _1) { return __lsx_vfclass_d(_1); }
// CHECK-LABEL: @vfsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfsqrt_s(v4f32 _1) { return __lsx_vfsqrt_s(_1); }
// CHECK-LABEL: @vfsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfsqrt_d(v2f64 _1) { return __lsx_vfsqrt_d(_1); }
// CHECK-LABEL: @vfrecip_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfrecip_s(v4f32 _1) { return __lsx_vfrecip_s(_1); }
// CHECK-LABEL: @vfrecip_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfrecip_d(v2f64 _1) { return __lsx_vfrecip_d(_1); }
// CHECK-LABEL: @vfrint_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfrint_s(v4f32 _1) { return __lsx_vfrint_s(_1); }
// CHECK-LABEL: @vfrint_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfrint_d(v2f64 _1) { return __lsx_vfrint_d(_1); }
// CHECK-LABEL: @vfrsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfrsqrt_s(v4f32 _1) { return __lsx_vfrsqrt_s(_1); }
// CHECK-LABEL: @vfrsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfrsqrt_d(v2f64 _1) { return __lsx_vfrsqrt_d(_1); }
// CHECK-LABEL: @vflogb_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vflogb_s(v4f32 _1) { return __lsx_vflogb_s(_1); }
// CHECK-LABEL: @vflogb_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vflogb_d(v2f64 _1) { return __lsx_vflogb_d(_1); }
// CHECK-LABEL: @vfcvth_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfcvth_s_h(v8i16 _1) { return __lsx_vfcvth_s_h(_1); }
// CHECK-LABEL: @vfcvth_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfcvth_d_s(v4f32 _1) { return __lsx_vfcvth_d_s(_1); }
// CHECK-LABEL: @vfcvtl_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfcvtl_s_h(v8i16 _1) { return __lsx_vfcvtl_s_h(_1); }
// CHECK-LABEL: @vfcvtl_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfcvtl_d_s(v4f32 _1) { return __lsx_vfcvtl_d_s(_1); }
// CHECK-LABEL: @vftint_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftint_w_s(v4f32 _1) { return __lsx_vftint_w_s(_1); }
// CHECK-LABEL: @vftint_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftint_l_d(v2f64 _1) { return __lsx_vftint_l_d(_1); }
// CHECK-LABEL: @vftint_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vftint_wu_s(v4f32 _1) { return __lsx_vftint_wu_s(_1); }
// CHECK-LABEL: @vftint_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vftint_lu_d(v2f64 _1) { return __lsx_vftint_lu_d(_1); }
// CHECK-LABEL: @vftintrz_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrz_w_s(v4f32 _1) { return __lsx_vftintrz_w_s(_1); }
// CHECK-LABEL: @vftintrz_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrz_l_d(v2f64 _1) { return __lsx_vftintrz_l_d(_1); }
// CHECK-LABEL: @vftintrz_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vftintrz_wu_s(v4f32 _1) { return __lsx_vftintrz_wu_s(_1); }
// CHECK-LABEL: @vftintrz_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vftintrz_lu_d(v2f64 _1) { return __lsx_vftintrz_lu_d(_1); }
// CHECK-LABEL: @vffint_s_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vffint_s_w(v4i32 _1) { return __lsx_vffint_s_w(_1); }
// CHECK-LABEL: @vffint_d_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffint_d_l(v2i64 _1) { return __lsx_vffint_d_l(_1); }
// CHECK-LABEL: @vffint_s_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vffint_s_wu(v4u32 _1) { return __lsx_vffint_s_wu(_1); }
// CHECK-LABEL: @vffint_d_lu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffint_d_lu(v2u64 _1) { return __lsx_vffint_d_lu(_1); }
// CHECK-LABEL: @vandn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vandn_v(v16u8 _1, v16u8 _2) { return __lsx_vandn_v(_1, _2); }
// CHECK-LABEL: @vneg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vneg_b(v16i8 _1) { return __lsx_vneg_b(_1); }
// CHECK-LABEL: @vneg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vneg_h(v8i16 _1) { return __lsx_vneg_h(_1); }
// CHECK-LABEL: @vneg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vneg_w(v4i32 _1) { return __lsx_vneg_w(_1); }
// CHECK-LABEL: @vneg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vneg_d(v2i64 _1) { return __lsx_vneg_d(_1); }
// CHECK-LABEL: @vmuh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmuh_b(v16i8 _1, v16i8 _2) { return __lsx_vmuh_b(_1, _2); }
// CHECK-LABEL: @vmuh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmuh_h(v8i16 _1, v8i16 _2) { return __lsx_vmuh_h(_1, _2); }
// CHECK-LABEL: @vmuh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmuh_w(v4i32 _1, v4i32 _2) { return __lsx_vmuh_w(_1, _2); }
// CHECK-LABEL: @vmuh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmuh_d(v2i64 _1, v2i64 _2) { return __lsx_vmuh_d(_1, _2); }
// CHECK-LABEL: @vmuh_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmuh_bu(v16u8 _1, v16u8 _2) { return __lsx_vmuh_bu(_1, _2); }
// CHECK-LABEL: @vmuh_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmuh_hu(v8u16 _1, v8u16 _2) { return __lsx_vmuh_hu(_1, _2); }
// CHECK-LABEL: @vmuh_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmuh_wu(v4u32 _1, v4u32 _2) { return __lsx_vmuh_wu(_1, _2); }
// CHECK-LABEL: @vmuh_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmuh_du(v2u64 _1, v2u64 _2) { return __lsx_vmuh_du(_1, _2); }
// CHECK-LABEL: @vsllwil_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsllwil_h_b(v16i8 _1) { return __lsx_vsllwil_h_b(_1, 1); }
// CHECK-LABEL: @vsllwil_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsllwil_w_h(v8i16 _1) { return __lsx_vsllwil_w_h(_1, 1); }
// CHECK-LABEL: @vsllwil_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsllwil_d_w(v4i32 _1) { return __lsx_vsllwil_d_w(_1, 1); }
// CHECK-LABEL: @vsllwil_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vsllwil_hu_bu(v16u8 _1) { return __lsx_vsllwil_hu_bu(_1, 1); }
// CHECK-LABEL: @vsllwil_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vsllwil_wu_hu(v8u16 _1) { return __lsx_vsllwil_wu_hu(_1, 1); }
// CHECK-LABEL: @vsllwil_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vsllwil_du_wu(v4u32 _1) { return __lsx_vsllwil_du_wu(_1, 1); }
// CHECK-LABEL: @vsran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsran_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsran_b_h(_1, _2); }
// CHECK-LABEL: @vsran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsran_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsran_h_w(_1, _2); }
// CHECK-LABEL: @vsran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsran_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsran_w_d(_1, _2); }
// CHECK-LABEL: @vssran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssran_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssran_b_h(_1, _2); }
// CHECK-LABEL: @vssran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssran_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssran_h_w(_1, _2); }
// CHECK-LABEL: @vssran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssran_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssran_w_d(_1, _2); }
// CHECK-LABEL: @vssran_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssran_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssran_bu_h(_1, _2); }
// CHECK-LABEL: @vssran_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssran_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssran_hu_w(_1, _2); }
// CHECK-LABEL: @vssran_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssran_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssran_wu_d(_1, _2); }
// CHECK-LABEL: @vsrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrarn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsrarn_b_h(_1, _2); }
// CHECK-LABEL: @vsrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrarn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsrarn_h_w(_1, _2); }
// CHECK-LABEL: @vsrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrarn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsrarn_w_d(_1, _2); }
// CHECK-LABEL: @vssrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrarn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssrarn_b_h(_1, _2); }
// CHECK-LABEL: @vssrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrarn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssrarn_h_w(_1, _2); }
// CHECK-LABEL: @vssrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrarn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssrarn_w_d(_1, _2); }
// CHECK-LABEL: @vssrarn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrarn_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssrarn_bu_h(_1, _2); }
// CHECK-LABEL: @vssrarn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrarn_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssrarn_hu_w(_1, _2); }
// CHECK-LABEL: @vssrarn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrarn_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssrarn_wu_d(_1, _2); }
// CHECK-LABEL: @vsrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrln_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsrln_b_h(_1, _2); }
// CHECK-LABEL: @vsrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrln_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsrln_h_w(_1, _2); }
// CHECK-LABEL: @vsrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrln_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsrln_w_d(_1, _2); }
// CHECK-LABEL: @vssrln_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrln_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssrln_bu_h(_1, _2); }
// CHECK-LABEL: @vssrln_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrln_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssrln_hu_w(_1, _2); }
// CHECK-LABEL: @vssrln_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrln_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssrln_wu_d(_1, _2); }
// CHECK-LABEL: @vsrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlrn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vsrlrn_b_h(_1, _2); }
// CHECK-LABEL: @vsrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlrn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vsrlrn_h_w(_1, _2); }
// CHECK-LABEL: @vsrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlrn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vsrlrn_w_d(_1, _2); }
// CHECK-LABEL: @vssrlrn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrlrn_bu_h(v8u16 _1, v8u16 _2) { return __lsx_vssrlrn_bu_h(_1, _2); }
// CHECK-LABEL: @vssrlrn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrlrn_hu_w(v4u32 _1, v4u32 _2) { return __lsx_vssrlrn_hu_w(_1, _2); }
// CHECK-LABEL: @vssrlrn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrlrn_wu_d(v2u64 _1, v2u64 _2) { return __lsx_vssrlrn_wu_d(_1, _2); }
// CHECK-LABEL: @vfrstpi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vfrstpi_b(v16i8 _1, v16i8 _2) { return __lsx_vfrstpi_b(_1, _2, 1); }
// CHECK-LABEL: @vfrstpi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vfrstpi_h(v8i16 _1, v8i16 _2) { return __lsx_vfrstpi_h(_1, _2, 1); }
// CHECK-LABEL: @vfrstp_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vfrstp_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __lsx_vfrstp_b(_1, _2, _3);
}
// CHECK-LABEL: @vfrstp_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vfrstp_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __lsx_vfrstp_h(_1, _2, _3);
}
// CHECK-LABEL: @vshuf4i_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vshuf4i_d(v2i64 _1, v2i64 _2) { return __lsx_vshuf4i_d(_1, _2, 1); }
// CHECK-LABEL: @vbsrl_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vbsrl_v(v16i8 _1) { return __lsx_vbsrl_v(_1, 1); }
// CHECK-LABEL: @vbsll_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vbsll_v(v16i8 _1) { return __lsx_vbsll_v(_1, 1); }
// CHECK-LABEL: @vextrins_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vextrins_b(v16i8 _1, v16i8 _2) { return __lsx_vextrins_b(_1, _2, 1); }
// CHECK-LABEL: @vextrins_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vextrins_h(v8i16 _1, v8i16 _2) { return __lsx_vextrins_h(_1, _2, 1); }
// CHECK-LABEL: @vextrins_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vextrins_w(v4i32 _1, v4i32 _2) { return __lsx_vextrins_w(_1, _2, 1); }
// CHECK-LABEL: @vextrins_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vextrins_d(v2i64 _1, v2i64 _2) { return __lsx_vextrins_d(_1, _2, 1); }
// CHECK-LABEL: @vmskltz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmskltz_b(v16i8 _1) { return __lsx_vmskltz_b(_1); }
// CHECK-LABEL: @vmskltz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vmskltz_h(v8i16 _1) { return __lsx_vmskltz_h(_1); }
// CHECK-LABEL: @vmskltz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vmskltz_w(v4i32 _1) { return __lsx_vmskltz_w(_1); }
// CHECK-LABEL: @vmskltz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vmskltz_d(v2i64 _1) { return __lsx_vmskltz_d(_1); }
// CHECK-LABEL: @vsigncov_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsigncov_b(v16i8 _1, v16i8 _2) { return __lsx_vsigncov_b(_1, _2); }
// CHECK-LABEL: @vsigncov_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsigncov_h(v8i16 _1, v8i16 _2) { return __lsx_vsigncov_h(_1, _2); }
// CHECK-LABEL: @vsigncov_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsigncov_w(v4i32 _1, v4i32 _2) { return __lsx_vsigncov_w(_1, _2); }
// CHECK-LABEL: @vsigncov_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsigncov_d(v2i64 _1, v2i64 _2) { return __lsx_vsigncov_d(_1, _2); }
// CHECK-LABEL: @vfmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __lsx_vfmadd_s(_1, _2, _3);
}
// CHECK-LABEL: @vfmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __lsx_vfmadd_d(_1, _2, _3);
}
// CHECK-LABEL: @vfmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __lsx_vfmsub_s(_1, _2, _3);
}
// CHECK-LABEL: @vfmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __lsx_vfmsub_d(_1, _2, _3);
}
// CHECK-LABEL: @vfnmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfnmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __lsx_vfnmadd_s(_1, _2, _3);
}
// CHECK-LABEL: @vfnmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfnmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __lsx_vfnmadd_d(_1, _2, _3);
}
// CHECK-LABEL: @vfnmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfnmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __lsx_vfnmsub_s(_1, _2, _3);
}
// CHECK-LABEL: @vfnmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfnmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __lsx_vfnmsub_d(_1, _2, _3);
}
// CHECK-LABEL: @vftintrne_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrne_w_s(v4f32 _1) { return __lsx_vftintrne_w_s(_1); }
// CHECK-LABEL: @vftintrne_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrne_l_d(v2f64 _1) { return __lsx_vftintrne_l_d(_1); }
// CHECK-LABEL: @vftintrp_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrp_w_s(v4f32 _1) { return __lsx_vftintrp_w_s(_1); }
// CHECK-LABEL: @vftintrp_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrp_l_d(v2f64 _1) { return __lsx_vftintrp_l_d(_1); }
// CHECK-LABEL: @vftintrm_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrm_w_s(v4f32 _1) { return __lsx_vftintrm_w_s(_1); }
// CHECK-LABEL: @vftintrm_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrm_l_d(v2f64 _1) { return __lsx_vftintrm_l_d(_1); }
// CHECK-LABEL: @vftint_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftint_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftint_w_d(_1, _2); }
// CHECK-LABEL: @vffint_s_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vffint_s_l(v2i64 _1, v2i64 _2) { return __lsx_vffint_s_l(_1, _2); }
// CHECK-LABEL: @vftintrz_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrz_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrz_w_d(_1, _2); }
// CHECK-LABEL: @vftintrp_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrp_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrp_w_d(_1, _2); }
// CHECK-LABEL: @vftintrm_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrm_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrm_w_d(_1, _2); }
// CHECK-LABEL: @vftintrne_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrne_w_d(v2f64 _1, v2f64 _2) { return __lsx_vftintrne_w_d(_1, _2); }
// CHECK-LABEL: @vftintl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintl_l_s(v4f32 _1) { return __lsx_vftintl_l_s(_1); }
// CHECK-LABEL: @vftinth_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftinth_l_s(v4f32 _1) { return __lsx_vftinth_l_s(_1); }
// CHECK-LABEL: @vffinth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffinth_d_w(v4i32 _1) { return __lsx_vffinth_d_w(_1); }
// CHECK-LABEL: @vffintl_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffintl_d_w(v4i32 _1) { return __lsx_vffintl_d_w(_1); }
// CHECK-LABEL: @vftintrzl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrzl_l_s(v4f32 _1) { return __lsx_vftintrzl_l_s(_1); }
// CHECK-LABEL: @vftintrzh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrzh_l_s(v4f32 _1) { return __lsx_vftintrzh_l_s(_1); }
// CHECK-LABEL: @vftintrpl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrpl_l_s(v4f32 _1) { return __lsx_vftintrpl_l_s(_1); }
// CHECK-LABEL: @vftintrph_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrph_l_s(v4f32 _1) { return __lsx_vftintrph_l_s(_1); }
// CHECK-LABEL: @vftintrml_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrml_l_s(v4f32 _1) { return __lsx_vftintrml_l_s(_1); }
// CHECK-LABEL: @vftintrmh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrmh_l_s(v4f32 _1) { return __lsx_vftintrmh_l_s(_1); }
// CHECK-LABEL: @vftintrnel_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrnel_l_s(v4f32 _1) { return __lsx_vftintrnel_l_s(_1); }
// CHECK-LABEL: @vftintrneh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrneh_l_s(v4f32 _1) { return __lsx_vftintrneh_l_s(_1); }
// CHECK-LABEL: @vfrintrne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrne_s(v4f32 _1) { return __lsx_vfrintrne_s(_1); }
// CHECK-LABEL: @vfrintrne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrne_d(v2f64 _1) { return __lsx_vfrintrne_d(_1); }
// CHECK-LABEL: @vfrintrz_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrz_s(v4f32 _1) { return __lsx_vfrintrz_s(_1); }
// CHECK-LABEL: @vfrintrz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrz_d(v2f64 _1) { return __lsx_vfrintrz_d(_1); }
// CHECK-LABEL: @vfrintrp_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrp_s(v4f32 _1) { return __lsx_vfrintrp_s(_1); }
// CHECK-LABEL: @vfrintrp_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrp_d(v2f64 _1) { return __lsx_vfrintrp_d(_1); }
// CHECK-LABEL: @vfrintrm_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrm_s(v4f32 _1) { return __lsx_vfrintrm_s(_1); }
// CHECK-LABEL: @vfrintrm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrm_d(v2f64 _1) { return __lsx_vfrintrm_d(_1); }
// CHECK-LABEL: @vstelm_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_b(v16i8 _1, void *_2) { return __lsx_vstelm_b(_1, _2, 1, 1); }
// CHECK-LABEL: @vstelm_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[TMP0]], ptr [[_2:%.*]], i32 2, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_h(v8i16 _1, void *_2) { return __lsx_vstelm_h(_1, _2, 2, 1); }
// CHECK-LABEL: @vstelm_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[TMP0]], ptr [[_2:%.*]], i32 4, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_w(v4i32 _1, void *_2) { return __lsx_vstelm_w(_1, _2, 4, 1); }
// CHECK-LABEL: @vstelm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[TMP0]], ptr [[_2:%.*]], i32 8, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_d(v2i64 _1, void *_2) { return __lsx_vstelm_d(_1, _2, 8, 1); }
// CHECK-LABEL: @vaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_d_w(v4i32 _1, v4i32 _2) { return __lsx_vaddwev_d_w(_1, _2); }
// CHECK-LABEL: @vaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwev_w_h(v8i16 _1, v8i16 _2) { return __lsx_vaddwev_w_h(_1, _2); }
// CHECK-LABEL: @vaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwev_h_b(v16i8 _1, v16i8 _2) { return __lsx_vaddwev_h_b(_1, _2); }
// CHECK-LABEL: @vaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_d_w(v4i32 _1, v4i32 _2) { return __lsx_vaddwod_d_w(_1, _2); }
// CHECK-LABEL: @vaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwod_w_h(v8i16 _1, v8i16 _2) { return __lsx_vaddwod_w_h(_1, _2); }
// CHECK-LABEL: @vaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwod_h_b(v16i8 _1, v16i8 _2) { return __lsx_vaddwod_h_b(_1, _2); }
// CHECK-LABEL: @vaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vaddwev_d_wu(_1, _2); }
// CHECK-LABEL: @vaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwev_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vaddwev_w_hu(_1, _2); }
// CHECK-LABEL: @vaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwev_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vaddwev_h_bu(_1, _2); }
// CHECK-LABEL: @vaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vaddwod_d_wu(_1, _2); }
// CHECK-LABEL: @vaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwod_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vaddwod_w_hu(_1, _2); }
// CHECK-LABEL: @vaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwod_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vaddwod_h_bu(_1, _2); }
// CHECK-LABEL: @vaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_d_wu_w(v4u32 _1, v4i32 _2) {
return __lsx_vaddwev_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwev_w_hu_h(v8u16 _1, v8i16 _2) {
return __lsx_vaddwev_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwev_h_bu_b(v16u8 _1, v16i8 _2) {
return __lsx_vaddwev_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_d_wu_w(v4u32 _1, v4i32 _2) {
return __lsx_vaddwod_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwod_w_hu_h(v8u16 _1, v8i16 _2) {
return __lsx_vaddwod_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwod_h_bu_b(v16u8 _1, v16i8 _2) {
return __lsx_vaddwod_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vsubwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_d_w(v4i32 _1, v4i32 _2) { return __lsx_vsubwev_d_w(_1, _2); }
// CHECK-LABEL: @vsubwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwev_w_h(v8i16 _1, v8i16 _2) { return __lsx_vsubwev_w_h(_1, _2); }
// CHECK-LABEL: @vsubwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwev_h_b(v16i8 _1, v16i8 _2) { return __lsx_vsubwev_h_b(_1, _2); }
// CHECK-LABEL: @vsubwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_d_w(v4i32 _1, v4i32 _2) { return __lsx_vsubwod_d_w(_1, _2); }
// CHECK-LABEL: @vsubwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwod_w_h(v8i16 _1, v8i16 _2) { return __lsx_vsubwod_w_h(_1, _2); }
// CHECK-LABEL: @vsubwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwod_h_b(v16i8 _1, v16i8 _2) { return __lsx_vsubwod_h_b(_1, _2); }
// CHECK-LABEL: @vsubwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vsubwev_d_wu(_1, _2); }
// CHECK-LABEL: @vsubwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwev_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vsubwev_w_hu(_1, _2); }
// CHECK-LABEL: @vsubwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwev_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vsubwev_h_bu(_1, _2); }
// CHECK-LABEL: @vsubwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vsubwod_d_wu(_1, _2); }
// CHECK-LABEL: @vsubwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwod_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vsubwod_w_hu(_1, _2); }
// CHECK-LABEL: @vsubwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwod_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vsubwod_h_bu(_1, _2); }
// CHECK-LABEL: @vaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_q_d(v2i64 _1, v2i64 _2) { return __lsx_vaddwev_q_d(_1, _2); }
// CHECK-LABEL: @vaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_q_d(v2i64 _1, v2i64 _2) { return __lsx_vaddwod_q_d(_1, _2); }
// CHECK-LABEL: @vaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_q_du(v2u64 _1, v2u64 _2) { return __lsx_vaddwev_q_du(_1, _2); }
// CHECK-LABEL: @vaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_q_du(v2u64 _1, v2u64 _2) { return __lsx_vaddwod_q_du(_1, _2); }
// CHECK-LABEL: @vsubwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_q_d(v2i64 _1, v2i64 _2) { return __lsx_vsubwev_q_d(_1, _2); }
// CHECK-LABEL: @vsubwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_q_d(v2i64 _1, v2i64 _2) { return __lsx_vsubwod_q_d(_1, _2); }
// CHECK-LABEL: @vsubwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_q_du(v2u64 _1, v2u64 _2) { return __lsx_vsubwev_q_du(_1, _2); }
// CHECK-LABEL: @vsubwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_q_du(v2u64 _1, v2u64 _2) { return __lsx_vsubwod_q_du(_1, _2); }
// CHECK-LABEL: @vaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_q_du_d(v2u64 _1, v2i64 _2) {
return __lsx_vaddwev_q_du_d(_1, _2);
}
// CHECK-LABEL: @vaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_q_du_d(v2u64 _1, v2i64 _2) {
return __lsx_vaddwod_q_du_d(_1, _2);
}
// CHECK-LABEL: @vmulwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_d_w(v4i32 _1, v4i32 _2) { return __lsx_vmulwev_d_w(_1, _2); }
// CHECK-LABEL: @vmulwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwev_w_h(v8i16 _1, v8i16 _2) { return __lsx_vmulwev_w_h(_1, _2); }
// CHECK-LABEL: @vmulwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwev_h_b(v16i8 _1, v16i8 _2) { return __lsx_vmulwev_h_b(_1, _2); }
// CHECK-LABEL: @vmulwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_d_w(v4i32 _1, v4i32 _2) { return __lsx_vmulwod_d_w(_1, _2); }
// CHECK-LABEL: @vmulwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwod_w_h(v8i16 _1, v8i16 _2) { return __lsx_vmulwod_w_h(_1, _2); }
// CHECK-LABEL: @vmulwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwod_h_b(v16i8 _1, v16i8 _2) { return __lsx_vmulwod_h_b(_1, _2); }
// CHECK-LABEL: @vmulwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vmulwev_d_wu(_1, _2); }
// CHECK-LABEL: @vmulwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwev_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vmulwev_w_hu(_1, _2); }
// CHECK-LABEL: @vmulwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwev_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vmulwev_h_bu(_1, _2); }
// CHECK-LABEL: @vmulwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_d_wu(v4u32 _1, v4u32 _2) { return __lsx_vmulwod_d_wu(_1, _2); }
// CHECK-LABEL: @vmulwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwod_w_hu(v8u16 _1, v8u16 _2) { return __lsx_vmulwod_w_hu(_1, _2); }
// CHECK-LABEL: @vmulwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwod_h_bu(v16u8 _1, v16u8 _2) { return __lsx_vmulwod_h_bu(_1, _2); }
// CHECK-LABEL: @vmulwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_d_wu_w(v4u32 _1, v4i32 _2) {
return __lsx_vmulwev_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vmulwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwev_w_hu_h(v8u16 _1, v8i16 _2) {
return __lsx_vmulwev_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vmulwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwev_h_bu_b(v16u8 _1, v16i8 _2) {
return __lsx_vmulwev_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vmulwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_d_wu_w(v4u32 _1, v4i32 _2) {
return __lsx_vmulwod_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vmulwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwod_w_hu_h(v8u16 _1, v8i16 _2) {
return __lsx_vmulwod_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vmulwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwod_h_bu_b(v16u8 _1, v16i8 _2) {
return __lsx_vmulwod_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vmulwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_q_d(v2i64 _1, v2i64 _2) { return __lsx_vmulwev_q_d(_1, _2); }
// CHECK-LABEL: @vmulwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_q_d(v2i64 _1, v2i64 _2) { return __lsx_vmulwod_q_d(_1, _2); }
// CHECK-LABEL: @vmulwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_q_du(v2u64 _1, v2u64 _2) { return __lsx_vmulwev_q_du(_1, _2); }
// CHECK-LABEL: @vmulwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_q_du(v2u64 _1, v2u64 _2) { return __lsx_vmulwod_q_du(_1, _2); }
// CHECK-LABEL: @vmulwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_q_du_d(v2u64 _1, v2i64 _2) {
return __lsx_vmulwev_q_du_d(_1, _2);
}
// CHECK-LABEL: @vmulwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_q_du_d(v2u64 _1, v2i64 _2) {
return __lsx_vmulwod_q_du_d(_1, _2);
}
// CHECK-LABEL: @vhaddw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhaddw_q_d(v2i64 _1, v2i64 _2) { return __lsx_vhaddw_q_d(_1, _2); }
// CHECK-LABEL: @vhaddw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vhaddw_qu_du(v2u64 _1, v2u64 _2) { return __lsx_vhaddw_qu_du(_1, _2); }
// CHECK-LABEL: @vhsubw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhsubw_q_d(v2i64 _1, v2i64 _2) { return __lsx_vhsubw_q_d(_1, _2); }
// CHECK-LABEL: @vhsubw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vhsubw_qu_du(v2u64 _1, v2u64 _2) { return __lsx_vhsubw_qu_du(_1, _2); }
// CHECK-LABEL: @vmaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_d_w(v2i64 _1, v4i32 _2, v4i32 _3) {
return __lsx_vmaddwev_d_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwev_w_h(v4i32 _1, v8i16 _2, v8i16 _3) {
return __lsx_vmaddwev_w_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwev_h_b(v8i16 _1, v16i8 _2, v16i8 _3) {
return __lsx_vmaddwev_h_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwev_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) {
return __lsx_vmaddwev_d_wu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4u32 vmaddwev_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) {
return __lsx_vmaddwev_w_hu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8u16 vmaddwev_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) {
return __lsx_vmaddwev_h_bu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_d_w(v2i64 _1, v4i32 _2, v4i32 _3) {
return __lsx_vmaddwod_d_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwod_w_h(v4i32 _1, v8i16 _2, v8i16 _3) {
return __lsx_vmaddwod_w_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwod_h_b(v8i16 _1, v16i8 _2, v16i8 _3) {
return __lsx_vmaddwod_h_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwod_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) {
return __lsx_vmaddwod_d_wu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4u32 vmaddwod_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) {
return __lsx_vmaddwod_w_hu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8u16 vmaddwod_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) {
return __lsx_vmaddwod_h_bu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) {
return __lsx_vmaddwev_d_wu_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwev_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) {
return __lsx_vmaddwev_w_hu_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwev_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) {
return __lsx_vmaddwev_h_bu_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) {
return __lsx_vmaddwod_d_wu_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwod_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) {
return __lsx_vmaddwod_w_hu_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwod_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) {
return __lsx_vmaddwod_h_bu_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_q_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __lsx_vmaddwev_q_d(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_q_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __lsx_vmaddwod_q_d(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwev_q_du(v2u64 _1, v2u64 _2, v2u64 _3) {
return __lsx_vmaddwev_q_du(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwod_q_du(v2u64 _1, v2u64 _2, v2u64 _3) {
return __lsx_vmaddwod_q_du(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) {
return __lsx_vmaddwev_q_du_d(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) {
return __lsx_vmaddwod_q_du_d(_1, _2, _3);
}
// CHECK-LABEL: @vrotr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vrotr_b(v16i8 _1, v16i8 _2) { return __lsx_vrotr_b(_1, _2); }
// CHECK-LABEL: @vrotr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vrotr_h(v8i16 _1, v8i16 _2) { return __lsx_vrotr_h(_1, _2); }
// CHECK-LABEL: @vrotr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vrotr_w(v4i32 _1, v4i32 _2) { return __lsx_vrotr_w(_1, _2); }
// CHECK-LABEL: @vrotr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vrotr_d(v2i64 _1, v2i64 _2) { return __lsx_vrotr_d(_1, _2); }
// CHECK-LABEL: @vadd_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vadd_q(v2i64 _1, v2i64 _2) { return __lsx_vadd_q(_1, _2); }
// CHECK-LABEL: @vsub_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsub_q(v2i64 _1, v2i64 _2) { return __lsx_vsub_q(_1, _2); }
// CHECK-LABEL: @vldrepl_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vldrepl_b(void *_1) { return __lsx_vldrepl_b(_1, 1); }
// CHECK-LABEL: @vldrepl_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(ptr [[_1:%.*]], i32 2)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v8i16 vldrepl_h(void *_1) { return __lsx_vldrepl_h(_1, 2); }
// CHECK-LABEL: @vldrepl_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(ptr [[_1:%.*]], i32 4)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v4i32 vldrepl_w(void *_1) { return __lsx_vldrepl_w(_1, 4); }
// CHECK-LABEL: @vldrepl_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(ptr [[_1:%.*]], i32 8)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vldrepl_d(void *_1) { return __lsx_vldrepl_d(_1, 8); }
// CHECK-LABEL: @vmskgez_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmskgez_b(v16i8 _1) { return __lsx_vmskgez_b(_1); }
// CHECK-LABEL: @vmsknz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmsknz_b(v16i8 _1) { return __lsx_vmsknz_b(_1); }
// CHECK-LABEL: @vexth_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vexth_h_b(v16i8 _1) { return __lsx_vexth_h_b(_1); }
// CHECK-LABEL: @vexth_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vexth_w_h(v8i16 _1) { return __lsx_vexth_w_h(_1); }
// CHECK-LABEL: @vexth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vexth_d_w(v4i32 _1) { return __lsx_vexth_d_w(_1); }
// CHECK-LABEL: @vexth_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vexth_q_d(v2i64 _1) { return __lsx_vexth_q_d(_1); }
// CHECK-LABEL: @vexth_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vexth_hu_bu(v16u8 _1) { return __lsx_vexth_hu_bu(_1); }
// CHECK-LABEL: @vexth_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vexth_wu_hu(v8u16 _1) { return __lsx_vexth_wu_hu(_1); }
// CHECK-LABEL: @vexth_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vexth_du_wu(v4u32 _1) { return __lsx_vexth_du_wu(_1); }
// CHECK-LABEL: @vexth_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vexth_qu_du(v2u64 _1) { return __lsx_vexth_qu_du(_1); }
// CHECK-LABEL: @vrotri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vrotri_b(v16i8 _1) { return __lsx_vrotri_b(_1, 1); }
// CHECK-LABEL: @vrotri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vrotri_h(v8i16 _1) { return __lsx_vrotri_h(_1, 1); }
// CHECK-LABEL: @vrotri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vrotri_w(v4i32 _1) { return __lsx_vrotri_w(_1, 1); }
// CHECK-LABEL: @vrotri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vrotri_d(v2i64 _1) { return __lsx_vrotri_d(_1, 1); }
// CHECK-LABEL: @vextl_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vextl_q_d(v2i64 _1) { return __lsx_vextl_q_d(_1); }
// CHECK-LABEL: @vsrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrlni_b_h(_1, _2, 1); }
// CHECK-LABEL: @vsrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrlni_h_w(_1, _2, 1); }
// CHECK-LABEL: @vsrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrlni_w_d(_1, _2, 1); }
// CHECK-LABEL: @vsrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrlni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrlni_d_q(_1, _2, 1); }
// CHECK-LABEL: @vsrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlrni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrlrni_b_h(_1, _2, 1); }
// CHECK-LABEL: @vsrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlrni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrlrni_h_w(_1, _2, 1); }
// CHECK-LABEL: @vsrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlrni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrlrni_w_d(_1, _2, 1); }
// CHECK-LABEL: @vsrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrlrni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrlrni_d_q(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrlni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrlni_b_h(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrlni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrlni_h_w(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrlni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrlni_w_d(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrlni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrlni_d_q(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrlni_bu_h(v16u8 _1, v16i8 _2) { return __lsx_vssrlni_bu_h(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrlni_hu_w(v8u16 _1, v8i16 _2) { return __lsx_vssrlni_hu_w(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrlni_wu_d(v4u32 _1, v4i32 _2) { return __lsx_vssrlni_wu_d(_1, _2, 1); }
// CHECK-LABEL: @vssrlni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrlni_du_q(v2u64 _1, v2i64 _2) { return __lsx_vssrlni_du_q(_1, _2, 1); }
// CHECK-LABEL: @vssrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrlrni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrlrni_b_h(_1, _2, 1); }
// CHECK-LABEL: @vssrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrlrni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrlrni_h_w(_1, _2, 1); }
// CHECK-LABEL: @vssrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrlrni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrlrni_w_d(_1, _2, 1); }
// CHECK-LABEL: @vssrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrlrni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrlrni_d_q(_1, _2, 1); }
// CHECK-LABEL: @vssrlrni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrlrni_bu_h(v16u8 _1, v16i8 _2) {
return __lsx_vssrlrni_bu_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrlrni_hu_w(v8u16 _1, v8i16 _2) {
return __lsx_vssrlrni_hu_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrlrni_wu_d(v4u32 _1, v4i32 _2) {
return __lsx_vssrlrni_wu_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrlrni_du_q(v2u64 _1, v2i64 _2) {
return __lsx_vssrlrni_du_q(_1, _2, 1);
}
// CHECK-LABEL: @vsrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrani_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrani_b_h(_1, _2, 1); }
// CHECK-LABEL: @vsrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrani_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrani_h_w(_1, _2, 1); }
// CHECK-LABEL: @vsrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrani_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrani_w_d(_1, _2, 1); }
// CHECK-LABEL: @vsrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrani_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrani_d_q(_1, _2, 1); }
// CHECK-LABEL: @vsrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrarni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vsrarni_b_h(_1, _2, 1); }
// CHECK-LABEL: @vsrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrarni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vsrarni_h_w(_1, _2, 1); }
// CHECK-LABEL: @vsrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrarni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vsrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: @vsrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrarni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vsrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: @vssrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrani_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrani_b_h(_1, _2, 1); }
// CHECK-LABEL: @vssrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrani_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrani_h_w(_1, _2, 1); }
// CHECK-LABEL: @vssrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrani_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrani_w_d(_1, _2, 1); }
// CHECK-LABEL: @vssrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrani_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrani_d_q(_1, _2, 1); }
// CHECK-LABEL: @vssrani_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrani_bu_h(v16u8 _1, v16i8 _2) { return __lsx_vssrani_bu_h(_1, _2, 1); }
// CHECK-LABEL: @vssrani_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrani_hu_w(v8u16 _1, v8i16 _2) { return __lsx_vssrani_hu_w(_1, _2, 1); }
// CHECK-LABEL: @vssrani_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrani_wu_d(v4u32 _1, v4i32 _2) { return __lsx_vssrani_wu_d(_1, _2, 1); }
// CHECK-LABEL: @vssrani_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrani_du_q(v2u64 _1, v2i64 _2) { return __lsx_vssrani_du_q(_1, _2, 1); }
// CHECK-LABEL: @vssrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrarni_b_h(v16i8 _1, v16i8 _2) { return __lsx_vssrarni_b_h(_1, _2, 1); }
// CHECK-LABEL: @vssrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrarni_h_w(v8i16 _1, v8i16 _2) { return __lsx_vssrarni_h_w(_1, _2, 1); }
// CHECK-LABEL: @vssrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrarni_w_d(v4i32 _1, v4i32 _2) { return __lsx_vssrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: @vssrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrarni_d_q(v2i64 _1, v2i64 _2) { return __lsx_vssrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: @vssrarni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrarni_bu_h(v16u8 _1, v16i8 _2) {
return __lsx_vssrarni_bu_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrarni_hu_w(v8u16 _1, v8i16 _2) {
return __lsx_vssrarni_hu_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrarni_wu_d(v4u32 _1, v4i32 _2) {
return __lsx_vssrarni_wu_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrarni_du_q(v2u64 _1, v2i64 _2) {
return __lsx_vssrarni_du_q(_1, _2, 1);
}
// CHECK-LABEL: @vpermi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpermi_w(v4i32 _1, v4i32 _2) { return __lsx_vpermi_w(_1, _2, 1); }
// CHECK-LABEL: @vld(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vld(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vld(void *_1) { return __lsx_vld(_1, 1); }
// CHECK-LABEL: @vst(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1)
// CHECK-NEXT: ret void
//
void vst(v16i8 _1, void *_2) { return __lsx_vst(_1, _2, 1); }
// CHECK-LABEL: @vssrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrlrn_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssrlrn_b_h(_1, _2); }
// CHECK-LABEL: @vssrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrlrn_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssrlrn_h_w(_1, _2); }
// CHECK-LABEL: @vssrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrlrn_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssrlrn_w_d(_1, _2); }
// CHECK-LABEL: @vssrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrln_b_h(v8i16 _1, v8i16 _2) { return __lsx_vssrln_b_h(_1, _2); }
// CHECK-LABEL: @vssrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrln_h_w(v4i32 _1, v4i32 _2) { return __lsx_vssrln_h_w(_1, _2); }
// CHECK-LABEL: @vssrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrln_w_d(v2i64 _1, v2i64 _2) { return __lsx_vssrln_w_d(_1, _2); }
// CHECK-LABEL: @vorn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vorn_v(v16i8 _1, v16i8 _2) { return __lsx_vorn_v(_1, _2); }
// CHECK-LABEL: @vldi(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldi(i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vldi() { return __lsx_vldi(1); }
// CHECK-LABEL: @vshuf_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __lsx_vshuf_b(_1, _2, _3);
@@ -4086,366 +5844,516 @@ v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) {
// CHECK-LABEL: @vldx(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldx(ptr [[_1:%.*]], i64 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vldx(void *_1) { return __lsx_vldx(_1, 1); }
// CHECK-LABEL: @vstx(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i64 1)
// CHECK-NEXT: ret void
//
void vstx(v16i8 _1, void *_2) { return __lsx_vstx(_1, _2, 1); }
// CHECK-LABEL: @vextl_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vextl_qu_du(v2u64 _1) { return __lsx_vextl_qu_du(_1); }
// CHECK-LABEL: @bnz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_b(v16u8 _1) { return __lsx_bnz_b(_1); }
// CHECK-LABEL: @bnz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_d(v2u64 _1) { return __lsx_bnz_d(_1); }
// CHECK-LABEL: @bnz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_h(v8u16 _1) { return __lsx_bnz_h(_1); }
// CHECK-LABEL: @bnz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_v(v16u8 _1) { return __lsx_bnz_v(_1); }
// CHECK-LABEL: @bnz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_w(v4u32 _1) { return __lsx_bnz_w(_1); }
// CHECK-LABEL: @bz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_b(v16u8 _1) { return __lsx_bz_b(_1); }
// CHECK-LABEL: @bz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_d(v2u64 _1) { return __lsx_bz_d(_1); }
// CHECK-LABEL: @bz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_h(v8u16 _1) { return __lsx_bz_h(_1); }
// CHECK-LABEL: @bz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_v(v16u8 _1) { return __lsx_bz_v(_1); }
// CHECK-LABEL: @bz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_w(v4u32 _1) { return __lsx_bz_w(_1); }
// CHECK-LABEL: @vfcmp_caf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_caf_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_caf_d(_1, _2); }
// CHECK-LABEL: @vfcmp_caf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_caf_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_caf_s(_1, _2); }
// CHECK-LABEL: @vfcmp_ceq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_ceq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_ceq_d(_1, _2); }
// CHECK-LABEL: @vfcmp_ceq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_ceq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_ceq_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cle_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cle_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cle_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cle_s(_1, _2); }
// CHECK-LABEL: @vfcmp_clt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_clt_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_clt_d(_1, _2); }
// CHECK-LABEL: @vfcmp_clt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_clt_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_clt_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cne_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cne_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cne_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cne_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cor_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cor_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cor_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cor_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cueq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cueq_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cueq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cueq_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cule_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cule_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cule_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cule_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cult_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cult_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cult_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cult_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cun_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cun_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cune_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_cune_d(_1, _2); }
// CHECK-LABEL: @vfcmp_cune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cune_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cune_s(_1, _2); }
// CHECK-LABEL: @vfcmp_cun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cun_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_cun_s(_1, _2); }
// CHECK-LABEL: @vfcmp_saf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_saf_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_saf_d(_1, _2); }
// CHECK-LABEL: @vfcmp_saf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_saf_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_saf_s(_1, _2); }
// CHECK-LABEL: @vfcmp_seq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_seq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_seq_d(_1, _2); }
// CHECK-LABEL: @vfcmp_seq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_seq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_seq_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sle_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sle_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sle_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sle_s(_1, _2); }
// CHECK-LABEL: @vfcmp_slt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_slt_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_slt_d(_1, _2); }
// CHECK-LABEL: @vfcmp_slt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_slt_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_slt_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sne_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sne_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sne_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sne_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sor_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sor_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sor_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sor_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sueq_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sueq_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sueq_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sueq_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sule_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sule_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sule_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sule_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sult_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sult_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sult_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sult_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sun_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sun_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sune_d(v2f64 _1, v2f64 _2) { return __lsx_vfcmp_sune_d(_1, _2); }
// CHECK-LABEL: @vfcmp_sune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sune_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sune_s(_1, _2); }
// CHECK-LABEL: @vfcmp_sun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sun_s(v4f32 _1, v4f32 _2) { return __lsx_vfcmp_sun_s(_1, _2); }
// CHECK-LABEL: @vrepli_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vrepli_b() { return __lsx_vrepli_b(1); }
// CHECK-LABEL: @vrepli_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vrepli_d() { return __lsx_vrepli_d(1); }
// CHECK-LABEL: @vrepli_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v8i16 vrepli_h() { return __lsx_vrepli_h(1); }
// CHECK-LABEL: @vrepli_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v4i32 vrepli_w() { return __lsx_vrepli_w(1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin.c b/clang/test/CodeGen/LoongArch/lsx/builtin.c
index ef5a390e1838..05a3d13a7fb9 100644
--- a/clang/test/CodeGen/LoongArch/lsx/builtin.c
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin.c
@@ -29,3319 +29,4547 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
// CHECK-LABEL: @vsll_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsll_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsll_b(_1, _2); }
// CHECK-LABEL: @vsll_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsll_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsll_h(_1, _2); }
// CHECK-LABEL: @vsll_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsll_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsll_w(_1, _2); }
// CHECK-LABEL: @vsll_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsll_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsll_d(_1, _2); }
// CHECK-LABEL: @vslli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslli_b(v16i8 _1) { return __builtin_lsx_vslli_b(_1, 1); }
// CHECK-LABEL: @vslli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslli_h(v8i16 _1) { return __builtin_lsx_vslli_h(_1, 1); }
// CHECK-LABEL: @vslli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslli_w(v4i32 _1) { return __builtin_lsx_vslli_w(_1, 1); }
// CHECK-LABEL: @vslli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslli_d(v2i64 _1) { return __builtin_lsx_vslli_d(_1, 1); }
// CHECK-LABEL: @vsra_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsra_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsra_b(_1, _2); }
// CHECK-LABEL: @vsra_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsra_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsra_h(_1, _2); }
// CHECK-LABEL: @vsra_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsra_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsra_w(_1, _2); }
// CHECK-LABEL: @vsra_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsra_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsra_d(_1, _2); }
// CHECK-LABEL: @vsrai_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrai_b(v16i8 _1) { return __builtin_lsx_vsrai_b(_1, 1); }
// CHECK-LABEL: @vsrai_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrai_h(v8i16 _1) { return __builtin_lsx_vsrai_h(_1, 1); }
// CHECK-LABEL: @vsrai_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrai_w(v4i32 _1) { return __builtin_lsx_vsrai_w(_1, 1); }
// CHECK-LABEL: @vsrai_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrai_d(v2i64 _1) { return __builtin_lsx_vsrai_d(_1, 1); }
// CHECK-LABEL: @vsrar_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrar_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsrar_b(_1, _2);
}
// CHECK-LABEL: @vsrar_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrar_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrar_h(_1, _2);
}
// CHECK-LABEL: @vsrar_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrar_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrar_w(_1, _2);
}
// CHECK-LABEL: @vsrar_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrar_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrar_d(_1, _2);
}
// CHECK-LABEL: @vsrari_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrari_b(v16i8 _1) { return __builtin_lsx_vsrari_b(_1, 1); }
// CHECK-LABEL: @vsrari_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrari_h(v8i16 _1) { return __builtin_lsx_vsrari_h(_1, 1); }
// CHECK-LABEL: @vsrari_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrari_w(v4i32 _1) { return __builtin_lsx_vsrari_w(_1, 1); }
// CHECK-LABEL: @vsrari_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrari_d(v2i64 _1) { return __builtin_lsx_vsrari_d(_1, 1); }
// CHECK-LABEL: @vsrl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrl_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsrl_b(_1, _2); }
// CHECK-LABEL: @vsrl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrl_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsrl_h(_1, _2); }
// CHECK-LABEL: @vsrl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrl_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsrl_w(_1, _2); }
// CHECK-LABEL: @vsrl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrl_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsrl_d(_1, _2); }
// CHECK-LABEL: @vsrli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrli_b(v16i8 _1) { return __builtin_lsx_vsrli_b(_1, 1); }
// CHECK-LABEL: @vsrli_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrli_h(v8i16 _1) { return __builtin_lsx_vsrli_h(_1, 1); }
// CHECK-LABEL: @vsrli_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrli_w(v4i32 _1) { return __builtin_lsx_vsrli_w(_1, 1); }
// CHECK-LABEL: @vsrli_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrli_d(v2i64 _1) { return __builtin_lsx_vsrli_d(_1, 1); }
// CHECK-LABEL: @vsrlr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlr_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsrlr_b(_1, _2);
}
// CHECK-LABEL: @vsrlr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlr_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrlr_h(_1, _2);
}
// CHECK-LABEL: @vsrlr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlr_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrlr_w(_1, _2);
}
// CHECK-LABEL: @vsrlr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrlr_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrlr_d(_1, _2);
}
// CHECK-LABEL: @vsrlri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsrlri_b(v16i8 _1) { return __builtin_lsx_vsrlri_b(_1, 1); }
// CHECK-LABEL: @vsrlri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsrlri_h(v8i16 _1) { return __builtin_lsx_vsrlri_h(_1, 1); }
// CHECK-LABEL: @vsrlri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsrlri_w(v4i32 _1) { return __builtin_lsx_vsrlri_w(_1, 1); }
// CHECK-LABEL: @vsrlri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsrlri_d(v2i64 _1) { return __builtin_lsx_vsrlri_d(_1, 1); }
// CHECK-LABEL: @vbitclr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitclr_b(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vbitclr_b(_1, _2);
}
// CHECK-LABEL: @vbitclr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vbitclr_h(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vbitclr_h(_1, _2);
}
// CHECK-LABEL: @vbitclr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vbitclr_w(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vbitclr_w(_1, _2);
}
// CHECK-LABEL: @vbitclr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vbitclr_d(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vbitclr_d(_1, _2);
}
// CHECK-LABEL: @vbitclri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vbitclri_b(v16u8 _1) { return __builtin_lsx_vbitclri_b(_1, 1); }
// CHECK-LABEL: @vbitclri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vbitclri_h(v8u16 _1) { return __builtin_lsx_vbitclri_h(_1, 1); }
// CHECK-LABEL: @vbitclri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vbitclri_w(v4u32 _1) { return __builtin_lsx_vbitclri_w(_1, 1); }
// CHECK-LABEL: @vbitclri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vbitclri_d(v2u64 _1) { return __builtin_lsx_vbitclri_d(_1, 1); }
// CHECK-LABEL: @vbitset_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitset_b(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vbitset_b(_1, _2);
}
// CHECK-LABEL: @vbitset_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vbitset_h(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vbitset_h(_1, _2);
}
// CHECK-LABEL: @vbitset_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vbitset_w(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vbitset_w(_1, _2);
}
// CHECK-LABEL: @vbitset_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vbitset_d(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vbitset_d(_1, _2);
}
// CHECK-LABEL: @vbitseti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vbitseti_b(v16u8 _1) { return __builtin_lsx_vbitseti_b(_1, 1); }
// CHECK-LABEL: @vbitseti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vbitseti_h(v8u16 _1) { return __builtin_lsx_vbitseti_h(_1, 1); }
// CHECK-LABEL: @vbitseti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vbitseti_w(v4u32 _1) { return __builtin_lsx_vbitseti_w(_1, 1); }
// CHECK-LABEL: @vbitseti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vbitseti_d(v2u64 _1) { return __builtin_lsx_vbitseti_d(_1, 1); }
// CHECK-LABEL: @vbitrev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitrev_b(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vbitrev_b(_1, _2);
}
// CHECK-LABEL: @vbitrev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vbitrev_h(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vbitrev_h(_1, _2);
}
// CHECK-LABEL: @vbitrev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vbitrev_w(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vbitrev_w(_1, _2);
}
// CHECK-LABEL: @vbitrev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vbitrev_d(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vbitrev_d(_1, _2);
}
// CHECK-LABEL: @vbitrevi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vbitrevi_b(v16u8 _1) { return __builtin_lsx_vbitrevi_b(_1, 1); }
// CHECK-LABEL: @vbitrevi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vbitrevi_h(v8u16 _1) { return __builtin_lsx_vbitrevi_h(_1, 1); }
// CHECK-LABEL: @vbitrevi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vbitrevi_w(v4u32 _1) { return __builtin_lsx_vbitrevi_w(_1, 1); }
// CHECK-LABEL: @vbitrevi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vbitrevi_d(v2u64 _1) { return __builtin_lsx_vbitrevi_d(_1, 1); }
// CHECK-LABEL: @vadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vadd_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vadd_b(_1, _2); }
// CHECK-LABEL: @vadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vadd_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vadd_h(_1, _2); }
// CHECK-LABEL: @vadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vadd_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vadd_w(_1, _2); }
// CHECK-LABEL: @vadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vadd_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vadd_d(_1, _2); }
// CHECK-LABEL: @vaddi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vaddi_bu(v16i8 _1) { return __builtin_lsx_vaddi_bu(_1, 1); }
// CHECK-LABEL: @vaddi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vaddi_hu(v8i16 _1) { return __builtin_lsx_vaddi_hu(_1, 1); }
// CHECK-LABEL: @vaddi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vaddi_wu(v4i32 _1) { return __builtin_lsx_vaddi_wu(_1, 1); }
// CHECK-LABEL: @vaddi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vaddi_du(v2i64 _1) { return __builtin_lsx_vaddi_du(_1, 1); }
// CHECK-LABEL: @vsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsub_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsub_b(_1, _2); }
// CHECK-LABEL: @vsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsub_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsub_h(_1, _2); }
// CHECK-LABEL: @vsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsub_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsub_w(_1, _2); }
// CHECK-LABEL: @vsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsub_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsub_d(_1, _2); }
// CHECK-LABEL: @vsubi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsubi_bu(v16i8 _1) { return __builtin_lsx_vsubi_bu(_1, 1); }
// CHECK-LABEL: @vsubi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsubi_hu(v8i16 _1) { return __builtin_lsx_vsubi_hu(_1, 1); }
// CHECK-LABEL: @vsubi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsubi_wu(v4i32 _1) { return __builtin_lsx_vsubi_wu(_1, 1); }
// CHECK-LABEL: @vsubi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsubi_du(v2i64 _1) { return __builtin_lsx_vsubi_du(_1, 1); }
// CHECK-LABEL: @vmax_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmax_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmax_b(_1, _2); }
// CHECK-LABEL: @vmax_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmax_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmax_h(_1, _2); }
// CHECK-LABEL: @vmax_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmax_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmax_w(_1, _2); }
// CHECK-LABEL: @vmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmax_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmax_d(_1, _2); }
// CHECK-LABEL: @vmaxi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmaxi_b(v16i8 _1) { return __builtin_lsx_vmaxi_b(_1, 1); }
// CHECK-LABEL: @vmaxi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vmaxi_h(v8i16 _1) { return __builtin_lsx_vmaxi_h(_1, 1); }
// CHECK-LABEL: @vmaxi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vmaxi_w(v4i32 _1) { return __builtin_lsx_vmaxi_w(_1, 1); }
// CHECK-LABEL: @vmaxi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vmaxi_d(v2i64 _1) { return __builtin_lsx_vmaxi_d(_1, 1); }
// CHECK-LABEL: @vmax_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmax_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vmax_bu(_1, _2);
}
// CHECK-LABEL: @vmax_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmax_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vmax_hu(_1, _2);
}
// CHECK-LABEL: @vmax_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmax_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vmax_wu(_1, _2);
}
// CHECK-LABEL: @vmax_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmax_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vmax_du(_1, _2);
}
// CHECK-LABEL: @vmaxi_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vmaxi_bu(v16u8 _1) { return __builtin_lsx_vmaxi_bu(_1, 1); }
// CHECK-LABEL: @vmaxi_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vmaxi_hu(v8u16 _1) { return __builtin_lsx_vmaxi_hu(_1, 1); }
// CHECK-LABEL: @vmaxi_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vmaxi_wu(v4u32 _1) { return __builtin_lsx_vmaxi_wu(_1, 1); }
// CHECK-LABEL: @vmaxi_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vmaxi_du(v2u64 _1) { return __builtin_lsx_vmaxi_du(_1, 1); }
// CHECK-LABEL: @vmin_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmin_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmin_b(_1, _2); }
// CHECK-LABEL: @vmin_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmin_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmin_h(_1, _2); }
// CHECK-LABEL: @vmin_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmin_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmin_w(_1, _2); }
// CHECK-LABEL: @vmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmin_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmin_d(_1, _2); }
// CHECK-LABEL: @vmini_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmini_b(v16i8 _1) { return __builtin_lsx_vmini_b(_1, 1); }
// CHECK-LABEL: @vmini_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vmini_h(v8i16 _1) { return __builtin_lsx_vmini_h(_1, 1); }
// CHECK-LABEL: @vmini_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vmini_w(v4i32 _1) { return __builtin_lsx_vmini_w(_1, 1); }
// CHECK-LABEL: @vmini_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vmini_d(v2i64 _1) { return __builtin_lsx_vmini_d(_1, 1); }
// CHECK-LABEL: @vmin_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmin_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vmin_bu(_1, _2);
}
// CHECK-LABEL: @vmin_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmin_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vmin_hu(_1, _2);
}
// CHECK-LABEL: @vmin_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmin_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vmin_wu(_1, _2);
}
// CHECK-LABEL: @vmin_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmin_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vmin_du(_1, _2);
}
// CHECK-LABEL: @vmini_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vmini_bu(v16u8 _1) { return __builtin_lsx_vmini_bu(_1, 1); }
// CHECK-LABEL: @vmini_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vmini_hu(v8u16 _1) { return __builtin_lsx_vmini_hu(_1, 1); }
// CHECK-LABEL: @vmini_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vmini_wu(v4u32 _1) { return __builtin_lsx_vmini_wu(_1, 1); }
// CHECK-LABEL: @vmini_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vmini_du(v2u64 _1) { return __builtin_lsx_vmini_du(_1, 1); }
// CHECK-LABEL: @vseq_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vseq_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vseq_b(_1, _2); }
// CHECK-LABEL: @vseq_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vseq_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vseq_h(_1, _2); }
// CHECK-LABEL: @vseq_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vseq_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vseq_w(_1, _2); }
// CHECK-LABEL: @vseq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vseq_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vseq_d(_1, _2); }
// CHECK-LABEL: @vseqi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vseqi_b(v16i8 _1) { return __builtin_lsx_vseqi_b(_1, 1); }
// CHECK-LABEL: @vseqi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vseqi_h(v8i16 _1) { return __builtin_lsx_vseqi_h(_1, 1); }
// CHECK-LABEL: @vseqi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vseqi_w(v4i32 _1) { return __builtin_lsx_vseqi_w(_1, 1); }
// CHECK-LABEL: @vseqi_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vseqi_d(v2i64 _1) { return __builtin_lsx_vseqi_d(_1, 1); }
// CHECK-LABEL: @vslti_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslti_b(v16i8 _1) { return __builtin_lsx_vslti_b(_1, 1); }
// CHECK-LABEL: @vslt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vslt_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vslt_b(_1, _2); }
// CHECK-LABEL: @vslt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vslt_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vslt_h(_1, _2); }
// CHECK-LABEL: @vslt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vslt_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vslt_w(_1, _2); }
// CHECK-LABEL: @vslt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vslt_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vslt_d(_1, _2); }
// CHECK-LABEL: @vslti_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslti_h(v8i16 _1) { return __builtin_lsx_vslti_h(_1, 1); }
// CHECK-LABEL: @vslti_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslti_w(v4i32 _1) { return __builtin_lsx_vslti_w(_1, 1); }
// CHECK-LABEL: @vslti_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslti_d(v2i64 _1) { return __builtin_lsx_vslti_d(_1, 1); }
// CHECK-LABEL: @vslt_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vslt_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vslt_bu(_1, _2);
}
// CHECK-LABEL: @vslt_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vslt_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vslt_hu(_1, _2);
}
// CHECK-LABEL: @vslt_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vslt_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vslt_wu(_1, _2);
}
// CHECK-LABEL: @vslt_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vslt_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vslt_du(_1, _2);
}
// CHECK-LABEL: @vslti_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslti_bu(v16u8 _1) { return __builtin_lsx_vslti_bu(_1, 1); }
// CHECK-LABEL: @vslti_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslti_hu(v8u16 _1) { return __builtin_lsx_vslti_hu(_1, 1); }
// CHECK-LABEL: @vslti_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslti_wu(v4u32 _1) { return __builtin_lsx_vslti_wu(_1, 1); }
// CHECK-LABEL: @vslti_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslti_du(v2u64 _1) { return __builtin_lsx_vslti_du(_1, 1); }
// CHECK-LABEL: @vsle_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsle_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vsle_b(_1, _2); }
// CHECK-LABEL: @vsle_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsle_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vsle_h(_1, _2); }
// CHECK-LABEL: @vsle_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsle_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vsle_w(_1, _2); }
// CHECK-LABEL: @vsle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsle_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsle_d(_1, _2); }
// CHECK-LABEL: @vslei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslei_b(v16i8 _1) { return __builtin_lsx_vslei_b(_1, 1); }
// CHECK-LABEL: @vslei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslei_h(v8i16 _1) { return __builtin_lsx_vslei_h(_1, 1); }
// CHECK-LABEL: @vslei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslei_w(v4i32 _1) { return __builtin_lsx_vslei_w(_1, 1); }
// CHECK-LABEL: @vslei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslei_d(v2i64 _1) { return __builtin_lsx_vslei_d(_1, 1); }
// CHECK-LABEL: @vsle_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsle_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vsle_bu(_1, _2);
}
// CHECK-LABEL: @vsle_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsle_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vsle_hu(_1, _2);
}
// CHECK-LABEL: @vsle_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsle_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vsle_wu(_1, _2);
}
// CHECK-LABEL: @vsle_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsle_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vsle_du(_1, _2);
}
// CHECK-LABEL: @vslei_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vslei_bu(v16u8 _1) { return __builtin_lsx_vslei_bu(_1, 1); }
// CHECK-LABEL: @vslei_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vslei_hu(v8u16 _1) { return __builtin_lsx_vslei_hu(_1, 1); }
// CHECK-LABEL: @vslei_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vslei_wu(v4u32 _1) { return __builtin_lsx_vslei_wu(_1, 1); }
// CHECK-LABEL: @vslei_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vslei_du(v2u64 _1) { return __builtin_lsx_vslei_du(_1, 1); }
// CHECK-LABEL: @vsat_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vsat_b(v16i8 _1) { return __builtin_lsx_vsat_b(_1, 1); }
// CHECK-LABEL: @vsat_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsat_h(v8i16 _1) { return __builtin_lsx_vsat_h(_1, 1); }
// CHECK-LABEL: @vsat_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsat_w(v4i32 _1) { return __builtin_lsx_vsat_w(_1, 1); }
// CHECK-LABEL: @vsat_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsat_d(v2i64 _1) { return __builtin_lsx_vsat_d(_1, 1); }
// CHECK-LABEL: @vsat_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vsat_bu(v16u8 _1) { return __builtin_lsx_vsat_bu(_1, 1); }
// CHECK-LABEL: @vsat_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vsat_hu(v8u16 _1) { return __builtin_lsx_vsat_hu(_1, 1); }
// CHECK-LABEL: @vsat_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vsat_wu(v4u32 _1) { return __builtin_lsx_vsat_wu(_1, 1); }
// CHECK-LABEL: @vsat_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vsat_du(v2u64 _1) { return __builtin_lsx_vsat_du(_1, 1); }
// CHECK-LABEL: @vadda_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vadda_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vadda_b(_1, _2);
}
// CHECK-LABEL: @vadda_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vadda_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vadda_h(_1, _2);
}
// CHECK-LABEL: @vadda_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vadda_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vadda_w(_1, _2);
}
// CHECK-LABEL: @vadda_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vadda_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vadda_d(_1, _2);
}
// CHECK-LABEL: @vsadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsadd_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsadd_b(_1, _2);
}
// CHECK-LABEL: @vsadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsadd_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsadd_h(_1, _2);
}
// CHECK-LABEL: @vsadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsadd_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsadd_w(_1, _2);
}
// CHECK-LABEL: @vsadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsadd_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsadd_d(_1, _2);
}
// CHECK-LABEL: @vsadd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vsadd_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vsadd_bu(_1, _2);
}
// CHECK-LABEL: @vsadd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vsadd_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vsadd_hu(_1, _2);
}
// CHECK-LABEL: @vsadd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vsadd_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vsadd_wu(_1, _2);
}
// CHECK-LABEL: @vsadd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vsadd_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vsadd_du(_1, _2);
}
// CHECK-LABEL: @vavg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vavg_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vavg_b(_1, _2); }
// CHECK-LABEL: @vavg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vavg_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vavg_h(_1, _2); }
// CHECK-LABEL: @vavg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vavg_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vavg_w(_1, _2); }
// CHECK-LABEL: @vavg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vavg_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vavg_d(_1, _2); }
// CHECK-LABEL: @vavg_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vavg_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vavg_bu(_1, _2);
}
// CHECK-LABEL: @vavg_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vavg_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vavg_hu(_1, _2);
}
// CHECK-LABEL: @vavg_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vavg_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vavg_wu(_1, _2);
}
// CHECK-LABEL: @vavg_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vavg_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vavg_du(_1, _2);
}
// CHECK-LABEL: @vavgr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vavgr_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vavgr_b(_1, _2);
}
// CHECK-LABEL: @vavgr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vavgr_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vavgr_h(_1, _2);
}
// CHECK-LABEL: @vavgr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vavgr_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vavgr_w(_1, _2);
}
// CHECK-LABEL: @vavgr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vavgr_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vavgr_d(_1, _2);
}
// CHECK-LABEL: @vavgr_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vavgr_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vavgr_bu(_1, _2);
}
// CHECK-LABEL: @vavgr_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vavgr_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vavgr_hu(_1, _2);
}
// CHECK-LABEL: @vavgr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vavgr_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vavgr_wu(_1, _2);
}
// CHECK-LABEL: @vavgr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vavgr_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vavgr_du(_1, _2);
}
// CHECK-LABEL: @vssub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssub_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vssub_b(_1, _2);
}
// CHECK-LABEL: @vssub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssub_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssub_h(_1, _2);
}
// CHECK-LABEL: @vssub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssub_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssub_w(_1, _2);
}
// CHECK-LABEL: @vssub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssub_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssub_d(_1, _2);
}
// CHECK-LABEL: @vssub_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssub_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vssub_bu(_1, _2);
}
// CHECK-LABEL: @vssub_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssub_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vssub_hu(_1, _2);
}
// CHECK-LABEL: @vssub_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssub_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vssub_wu(_1, _2);
}
// CHECK-LABEL: @vssub_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssub_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vssub_du(_1, _2);
}
// CHECK-LABEL: @vabsd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vabsd_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vabsd_b(_1, _2);
}
// CHECK-LABEL: @vabsd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vabsd_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vabsd_h(_1, _2);
}
// CHECK-LABEL: @vabsd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vabsd_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vabsd_w(_1, _2);
}
// CHECK-LABEL: @vabsd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vabsd_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vabsd_d(_1, _2);
}
// CHECK-LABEL: @vabsd_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vabsd_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vabsd_bu(_1, _2);
}
// CHECK-LABEL: @vabsd_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vabsd_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vabsd_hu(_1, _2);
}
// CHECK-LABEL: @vabsd_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vabsd_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vabsd_wu(_1, _2);
}
// CHECK-LABEL: @vabsd_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vabsd_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vabsd_du(_1, _2);
}
// CHECK-LABEL: @vmul_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmul_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmul_b(_1, _2); }
// CHECK-LABEL: @vmul_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmul_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmul_h(_1, _2); }
// CHECK-LABEL: @vmul_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmul_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmul_w(_1, _2); }
// CHECK-LABEL: @vmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmul_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmul_d(_1, _2); }
// CHECK-LABEL: @vmadd_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vmadd_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __builtin_lsx_vmadd_b(_1, _2, _3);
}
// CHECK-LABEL: @vmadd_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmadd_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __builtin_lsx_vmadd_h(_1, _2, _3);
}
// CHECK-LABEL: @vmadd_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmadd_w(v4i32 _1, v4i32 _2, v4i32 _3) {
return __builtin_lsx_vmadd_w(_1, _2, _3);
}
// CHECK-LABEL: @vmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmadd_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __builtin_lsx_vmadd_d(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vmsub_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __builtin_lsx_vmsub_b(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmsub_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __builtin_lsx_vmsub_h(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmsub_w(v4i32 _1, v4i32 _2, v4i32 _3) {
return __builtin_lsx_vmsub_w(_1, _2, _3);
}
// CHECK-LABEL: @vmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmsub_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __builtin_lsx_vmsub_d(_1, _2, _3);
}
// CHECK-LABEL: @vdiv_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vdiv_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vdiv_b(_1, _2); }
// CHECK-LABEL: @vdiv_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vdiv_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vdiv_h(_1, _2); }
// CHECK-LABEL: @vdiv_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vdiv_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vdiv_w(_1, _2); }
// CHECK-LABEL: @vdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vdiv_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vdiv_d(_1, _2); }
// CHECK-LABEL: @vdiv_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vdiv_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vdiv_bu(_1, _2);
}
// CHECK-LABEL: @vdiv_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vdiv_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vdiv_hu(_1, _2);
}
// CHECK-LABEL: @vdiv_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vdiv_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vdiv_wu(_1, _2);
}
// CHECK-LABEL: @vdiv_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vdiv_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vdiv_du(_1, _2);
}
// CHECK-LABEL: @vhaddw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vhaddw_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vhaddw_h_b(_1, _2);
}
// CHECK-LABEL: @vhaddw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vhaddw_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vhaddw_w_h(_1, _2);
}
// CHECK-LABEL: @vhaddw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhaddw_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vhaddw_d_w(_1, _2);
}
// CHECK-LABEL: @vhaddw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vhaddw_hu_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vhaddw_hu_bu(_1, _2);
}
// CHECK-LABEL: @vhaddw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vhaddw_wu_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vhaddw_wu_hu(_1, _2);
}
// CHECK-LABEL: @vhaddw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vhaddw_du_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vhaddw_du_wu(_1, _2);
}
// CHECK-LABEL: @vhsubw_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vhsubw_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vhsubw_h_b(_1, _2);
}
// CHECK-LABEL: @vhsubw_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vhsubw_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vhsubw_w_h(_1, _2);
}
// CHECK-LABEL: @vhsubw_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhsubw_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vhsubw_d_w(_1, _2);
}
// CHECK-LABEL: @vhsubw_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vhsubw_hu_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vhsubw_hu_bu(_1, _2);
}
// CHECK-LABEL: @vhsubw_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vhsubw_wu_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vhsubw_wu_hu(_1, _2);
}
// CHECK-LABEL: @vhsubw_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhsubw_du_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vhsubw_du_wu(_1, _2);
}
// CHECK-LABEL: @vmod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmod_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmod_b(_1, _2); }
// CHECK-LABEL: @vmod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmod_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmod_h(_1, _2); }
// CHECK-LABEL: @vmod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmod_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmod_w(_1, _2); }
// CHECK-LABEL: @vmod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmod_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmod_d(_1, _2); }
// CHECK-LABEL: @vmod_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmod_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vmod_bu(_1, _2);
}
// CHECK-LABEL: @vmod_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmod_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vmod_hu(_1, _2);
}
// CHECK-LABEL: @vmod_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmod_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vmod_wu(_1, _2);
}
// CHECK-LABEL: @vmod_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmod_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vmod_du(_1, _2);
}
// CHECK-LABEL: @vreplve_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vreplve_b(v16i8 _1, int _2) {
return __builtin_lsx_vreplve_b(_1, _2);
}
// CHECK-LABEL: @vreplve_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vreplve_h(v8i16 _1, int _2) {
return __builtin_lsx_vreplve_h(_1, _2);
}
// CHECK-LABEL: @vreplve_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vreplve_w(v4i32 _1, int _2) {
return __builtin_lsx_vreplve_w(_1, _2);
}
// CHECK-LABEL: @vreplve_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[_1:%.*]], i32 [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> [[TMP0]], i32 [[_2:%.*]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vreplve_d(v2i64 _1, int _2) {
return __builtin_lsx_vreplve_d(_1, _2);
}
// CHECK-LABEL: @vreplvei_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vreplvei_b(v16i8 _1) { return __builtin_lsx_vreplvei_b(_1, 1); }
// CHECK-LABEL: @vreplvei_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vreplvei_h(v8i16 _1) { return __builtin_lsx_vreplvei_h(_1, 1); }
// CHECK-LABEL: @vreplvei_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vreplvei_w(v4i32 _1) { return __builtin_lsx_vreplvei_w(_1, 1); }
// CHECK-LABEL: @vreplvei_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vreplvei_d(v2i64 _1) { return __builtin_lsx_vreplvei_d(_1, 1); }
// CHECK-LABEL: @vpickev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpickev_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vpickev_b(_1, _2);
}
// CHECK-LABEL: @vpickev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpickev_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vpickev_h(_1, _2);
}
// CHECK-LABEL: @vpickev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpickev_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vpickev_w(_1, _2);
}
// CHECK-LABEL: @vpickev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpickev_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vpickev_d(_1, _2);
}
// CHECK-LABEL: @vpickod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpickod_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vpickod_b(_1, _2);
}
// CHECK-LABEL: @vpickod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpickod_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vpickod_h(_1, _2);
}
// CHECK-LABEL: @vpickod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpickod_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vpickod_w(_1, _2);
}
// CHECK-LABEL: @vpickod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpickod_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vpickod_d(_1, _2);
}
// CHECK-LABEL: @vilvh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vilvh_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vilvh_b(_1, _2);
}
// CHECK-LABEL: @vilvh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vilvh_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vilvh_h(_1, _2);
}
// CHECK-LABEL: @vilvh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vilvh_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vilvh_w(_1, _2);
}
// CHECK-LABEL: @vilvh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vilvh_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vilvh_d(_1, _2);
}
// CHECK-LABEL: @vilvl_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vilvl_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vilvl_b(_1, _2);
}
// CHECK-LABEL: @vilvl_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vilvl_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vilvl_h(_1, _2);
}
// CHECK-LABEL: @vilvl_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vilvl_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vilvl_w(_1, _2);
}
// CHECK-LABEL: @vilvl_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vilvl_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vilvl_d(_1, _2);
}
// CHECK-LABEL: @vpackev_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpackev_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vpackev_b(_1, _2);
}
// CHECK-LABEL: @vpackev_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpackev_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vpackev_h(_1, _2);
}
// CHECK-LABEL: @vpackev_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpackev_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vpackev_w(_1, _2);
}
// CHECK-LABEL: @vpackev_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpackev_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vpackev_d(_1, _2);
}
// CHECK-LABEL: @vpackod_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vpackod_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vpackod_b(_1, _2);
}
// CHECK-LABEL: @vpackod_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vpackod_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vpackod_h(_1, _2);
}
// CHECK-LABEL: @vpackod_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpackod_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vpackod_w(_1, _2);
}
// CHECK-LABEL: @vpackod_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vpackod_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vpackod_d(_1, _2);
}
// CHECK-LABEL: @vshuf_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vshuf_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __builtin_lsx_vshuf_h(_1, _2, _3);
}
// CHECK-LABEL: @vshuf_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vshuf_w(v4i32 _1, v4i32 _2, v4i32 _3) {
return __builtin_lsx_vshuf_w(_1, _2, _3);
}
// CHECK-LABEL: @vshuf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vshuf_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __builtin_lsx_vshuf_d(_1, _2, _3);
}
// CHECK-LABEL: @vand_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vand_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vand_v(_1, _2); }
// CHECK-LABEL: @vandi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vandi_b(v16u8 _1) { return __builtin_lsx_vandi_b(_1, 1); }
// CHECK-LABEL: @vor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vor_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vor_v(_1, _2); }
// CHECK-LABEL: @vori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vori_b(v16u8 _1) { return __builtin_lsx_vori_b(_1, 1); }
// CHECK-LABEL: @vnor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vnor_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vnor_v(_1, _2); }
// CHECK-LABEL: @vnori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vnori_b(v16u8 _1) { return __builtin_lsx_vnori_b(_1, 1); }
// CHECK-LABEL: @vxor_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vxor_v(v16u8 _1, v16u8 _2) { return __builtin_lsx_vxor_v(_1, _2); }
// CHECK-LABEL: @vxori_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16u8 vxori_b(v16u8 _1) { return __builtin_lsx_vxori_b(_1, 1); }
// CHECK-LABEL: @vbitsel_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16u8 vbitsel_v(v16u8 _1, v16u8 _2, v16u8 _3) {
return __builtin_lsx_vbitsel_v(_1, _2, _3);
}
// CHECK-LABEL: @vbitseli_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vbitseli_b(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vbitseli_b(_1, _2, 1);
}
// CHECK-LABEL: @vshuf4i_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vshuf4i_b(v16i8 _1) { return __builtin_lsx_vshuf4i_b(_1, 1); }
// CHECK-LABEL: @vshuf4i_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vshuf4i_h(v8i16 _1) { return __builtin_lsx_vshuf4i_h(_1, 1); }
// CHECK-LABEL: @vshuf4i_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vshuf4i_w(v4i32 _1) { return __builtin_lsx_vshuf4i_w(_1, 1); }
// CHECK-LABEL: @vreplgr2vr_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vreplgr2vr_b(int _1) { return __builtin_lsx_vreplgr2vr_b(_1); }
// CHECK-LABEL: @vreplgr2vr_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v8i16 vreplgr2vr_h(int _1) { return __builtin_lsx_vreplgr2vr_h(_1); }
// CHECK-LABEL: @vreplgr2vr_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32 [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v4i32 vreplgr2vr_w(int _1) { return __builtin_lsx_vreplgr2vr_w(_1); }
// CHECK-LABEL: @vreplgr2vr_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vreplgr2vr_d(long _1) { return __builtin_lsx_vreplgr2vr_d(_1); }
// CHECK-LABEL: @vpcnt_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vpcnt_b(v16i8 _1) { return __builtin_lsx_vpcnt_b(_1); }
// CHECK-LABEL: @vpcnt_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vpcnt_h(v8i16 _1) { return __builtin_lsx_vpcnt_h(_1); }
// CHECK-LABEL: @vpcnt_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vpcnt_w(v4i32 _1) { return __builtin_lsx_vpcnt_w(_1); }
// CHECK-LABEL: @vpcnt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vpcnt_d(v2i64 _1) { return __builtin_lsx_vpcnt_d(_1); }
// CHECK-LABEL: @vclo_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vclo_b(v16i8 _1) { return __builtin_lsx_vclo_b(_1); }
// CHECK-LABEL: @vclo_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vclo_h(v8i16 _1) { return __builtin_lsx_vclo_h(_1); }
// CHECK-LABEL: @vclo_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vclo_w(v4i32 _1) { return __builtin_lsx_vclo_w(_1); }
// CHECK-LABEL: @vclo_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vclo_d(v2i64 _1) { return __builtin_lsx_vclo_d(_1); }
// CHECK-LABEL: @vclz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vclz_b(v16i8 _1) { return __builtin_lsx_vclz_b(_1); }
// CHECK-LABEL: @vclz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vclz_h(v8i16 _1) { return __builtin_lsx_vclz_h(_1); }
// CHECK-LABEL: @vclz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vclz_w(v4i32 _1) { return __builtin_lsx_vclz_w(_1); }
// CHECK-LABEL: @vclz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vclz_d(v2i64 _1) { return __builtin_lsx_vclz_d(_1); }
// CHECK-LABEL: @vpickve2gr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int vpickve2gr_b(v16i8 _1) { return __builtin_lsx_vpickve2gr_b(_1, 1); }
// CHECK-LABEL: @vpickve2gr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int vpickve2gr_h(v8i16 _1) { return __builtin_lsx_vpickve2gr_h(_1, 1); }
// CHECK-LABEL: @vpickve2gr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int vpickve2gr_w(v4i32 _1) { return __builtin_lsx_vpickve2gr_w(_1, 1); }
// CHECK-LABEL: @vpickve2gr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
long vpickve2gr_d(v2i64 _1) { return __builtin_lsx_vpickve2gr_d(_1, 1); }
// CHECK-LABEL: @vpickve2gr_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int vpickve2gr_bu(v16i8 _1) {
return __builtin_lsx_vpickve2gr_bu(_1, 1);
}
// CHECK-LABEL: @vpickve2gr_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int vpickve2gr_hu(v8i16 _1) {
return __builtin_lsx_vpickve2gr_hu(_1, 1);
}
// CHECK-LABEL: @vpickve2gr_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i32 [[TMP1]]
//
unsigned int vpickve2gr_wu(v4i32 _1) {
return __builtin_lsx_vpickve2gr_wu(_1, 1);
}
// CHECK-LABEL: @vpickve2gr_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret i64 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: ret i64 [[TMP1]]
//
unsigned long int vpickve2gr_du(v2i64 _1) {
return __builtin_lsx_vpickve2gr_du(_1, 1);
}
// CHECK-LABEL: @vinsgr2vr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> [[TMP0]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vinsgr2vr_b(v16i8 _1) {
return __builtin_lsx_vinsgr2vr_b(_1, 1, 1);
}
// CHECK-LABEL: @vinsgr2vr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> [[TMP0]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vinsgr2vr_h(v8i16 _1) {
return __builtin_lsx_vinsgr2vr_h(_1, 1, 1);
}
// CHECK-LABEL: @vinsgr2vr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[_1:%.*]], i32 1, i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> [[TMP0]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vinsgr2vr_w(v4i32 _1) {
return __builtin_lsx_vinsgr2vr_w(_1, 1, 1);
}
// CHECK-LABEL: @vinsgr2vr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[_1:%.*]], i64 1, i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> [[TMP0]], i64 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vinsgr2vr_d(v2i64 _1) {
return __builtin_lsx_vinsgr2vr_d(_1, 1, 1);
}
// CHECK-LABEL: @vfadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfadd_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfadd_s(_1, _2);
}
// CHECK-LABEL: @vfadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfadd_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfadd_d(_1, _2);
}
// CHECK-LABEL: @vfsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfsub_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfsub_s(_1, _2);
}
// CHECK-LABEL: @vfsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfsub_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfsub_d(_1, _2);
}
// CHECK-LABEL: @vfmul_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmul_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfmul_s(_1, _2);
}
// CHECK-LABEL: @vfmul_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmul_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfmul_d(_1, _2);
}
// CHECK-LABEL: @vfdiv_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfdiv_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfdiv_s(_1, _2);
}
// CHECK-LABEL: @vfdiv_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfdiv_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfdiv_d(_1, _2);
}
// CHECK-LABEL: @vfcvt_h_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vfcvt_h_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcvt_h_s(_1, _2);
}
// CHECK-LABEL: @vfcvt_s_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfcvt_s_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcvt_s_d(_1, _2);
}
// CHECK-LABEL: @vfmin_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmin_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfmin_s(_1, _2);
}
// CHECK-LABEL: @vfmin_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmin_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfmin_d(_1, _2);
}
// CHECK-LABEL: @vfmina_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmina_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfmina_s(_1, _2);
}
// CHECK-LABEL: @vfmina_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmina_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfmina_d(_1, _2);
}
// CHECK-LABEL: @vfmax_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmax_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfmax_s(_1, _2);
}
// CHECK-LABEL: @vfmax_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmax_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfmax_d(_1, _2);
}
// CHECK-LABEL: @vfmaxa_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vfmaxa_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfmaxa_s(_1, _2);
}
// CHECK-LABEL: @vfmaxa_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2f64 vfmaxa_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfmaxa_d(_1, _2);
}
// CHECK-LABEL: @vfclass_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfclass_s(v4f32 _1) { return __builtin_lsx_vfclass_s(_1); }
// CHECK-LABEL: @vfclass_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfclass_d(v2f64 _1) { return __builtin_lsx_vfclass_d(_1); }
// CHECK-LABEL: @vfsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfsqrt_s(v4f32 _1) { return __builtin_lsx_vfsqrt_s(_1); }
// CHECK-LABEL: @vfsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfsqrt_d(v2f64 _1) { return __builtin_lsx_vfsqrt_d(_1); }
// CHECK-LABEL: @vfrecip_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfrecip_s(v4f32 _1) { return __builtin_lsx_vfrecip_s(_1); }
// CHECK-LABEL: @vfrecip_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfrecip_d(v2f64 _1) { return __builtin_lsx_vfrecip_d(_1); }
// CHECK-LABEL: @vfrint_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfrint_s(v4f32 _1) { return __builtin_lsx_vfrint_s(_1); }
// CHECK-LABEL: @vfrint_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfrint_d(v2f64 _1) { return __builtin_lsx_vfrint_d(_1); }
// CHECK-LABEL: @vfrsqrt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfrsqrt_s(v4f32 _1) { return __builtin_lsx_vfrsqrt_s(_1); }
// CHECK-LABEL: @vfrsqrt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfrsqrt_d(v2f64 _1) { return __builtin_lsx_vfrsqrt_d(_1); }
// CHECK-LABEL: @vflogb_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vflogb_s(v4f32 _1) { return __builtin_lsx_vflogb_s(_1); }
// CHECK-LABEL: @vflogb_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vflogb_d(v2f64 _1) { return __builtin_lsx_vflogb_d(_1); }
// CHECK-LABEL: @vfcvth_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfcvth_s_h(v8i16 _1) { return __builtin_lsx_vfcvth_s_h(_1); }
// CHECK-LABEL: @vfcvth_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfcvth_d_s(v4f32 _1) { return __builtin_lsx_vfcvth_d_s(_1); }
// CHECK-LABEL: @vfcvtl_s_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vfcvtl_s_h(v8i16 _1) { return __builtin_lsx_vfcvtl_s_h(_1); }
// CHECK-LABEL: @vfcvtl_d_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vfcvtl_d_s(v4f32 _1) { return __builtin_lsx_vfcvtl_d_s(_1); }
// CHECK-LABEL: @vftint_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftint_w_s(v4f32 _1) { return __builtin_lsx_vftint_w_s(_1); }
// CHECK-LABEL: @vftint_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftint_l_d(v2f64 _1) { return __builtin_lsx_vftint_l_d(_1); }
// CHECK-LABEL: @vftint_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vftint_wu_s(v4f32 _1) { return __builtin_lsx_vftint_wu_s(_1); }
// CHECK-LABEL: @vftint_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vftint_lu_d(v2f64 _1) { return __builtin_lsx_vftint_lu_d(_1); }
// CHECK-LABEL: @vftintrz_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrz_w_s(v4f32 _1) { return __builtin_lsx_vftintrz_w_s(_1); }
// CHECK-LABEL: @vftintrz_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrz_l_d(v2f64 _1) { return __builtin_lsx_vftintrz_l_d(_1); }
// CHECK-LABEL: @vftintrz_wu_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vftintrz_wu_s(v4f32 _1) { return __builtin_lsx_vftintrz_wu_s(_1); }
// CHECK-LABEL: @vftintrz_lu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vftintrz_lu_d(v2f64 _1) { return __builtin_lsx_vftintrz_lu_d(_1); }
// CHECK-LABEL: @vffint_s_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vffint_s_w(v4i32 _1) { return __builtin_lsx_vffint_s_w(_1); }
// CHECK-LABEL: @vffint_d_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffint_d_l(v2i64 _1) { return __builtin_lsx_vffint_d_l(_1); }
// CHECK-LABEL: @vffint_s_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4f32 vffint_s_wu(v4u32 _1) { return __builtin_lsx_vffint_s_wu(_1); }
// CHECK-LABEL: @vffint_d_lu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffint_d_lu(v2u64 _1) { return __builtin_lsx_vffint_d_lu(_1); }
// CHECK-LABEL: @vandn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vandn_v(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vandn_v(_1, _2);
}
// CHECK-LABEL: @vneg_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vneg_b(v16i8 _1) { return __builtin_lsx_vneg_b(_1); }
// CHECK-LABEL: @vneg_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vneg_h(v8i16 _1) { return __builtin_lsx_vneg_h(_1); }
// CHECK-LABEL: @vneg_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vneg_w(v4i32 _1) { return __builtin_lsx_vneg_w(_1); }
// CHECK-LABEL: @vneg_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vneg_d(v2i64 _1) { return __builtin_lsx_vneg_d(_1); }
// CHECK-LABEL: @vmuh_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vmuh_b(v16i8 _1, v16i8 _2) { return __builtin_lsx_vmuh_b(_1, _2); }
// CHECK-LABEL: @vmuh_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmuh_h(v8i16 _1, v8i16 _2) { return __builtin_lsx_vmuh_h(_1, _2); }
// CHECK-LABEL: @vmuh_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmuh_w(v4i32 _1, v4i32 _2) { return __builtin_lsx_vmuh_w(_1, _2); }
// CHECK-LABEL: @vmuh_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmuh_d(v2i64 _1, v2i64 _2) { return __builtin_lsx_vmuh_d(_1, _2); }
// CHECK-LABEL: @vmuh_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vmuh_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vmuh_bu(_1, _2);
}
// CHECK-LABEL: @vmuh_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vmuh_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vmuh_hu(_1, _2);
}
// CHECK-LABEL: @vmuh_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vmuh_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vmuh_wu(_1, _2);
}
// CHECK-LABEL: @vmuh_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vmuh_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vmuh_du(_1, _2);
}
// CHECK-LABEL: @vsllwil_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vsllwil_h_b(v16i8 _1) { return __builtin_lsx_vsllwil_h_b(_1, 1); }
// CHECK-LABEL: @vsllwil_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vsllwil_w_h(v8i16 _1) { return __builtin_lsx_vsllwil_w_h(_1, 1); }
// CHECK-LABEL: @vsllwil_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vsllwil_d_w(v4i32 _1) { return __builtin_lsx_vsllwil_d_w(_1, 1); }
// CHECK-LABEL: @vsllwil_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vsllwil_hu_bu(v16u8 _1) {
return __builtin_lsx_vsllwil_hu_bu(_1, 1);
}
// CHECK-LABEL: @vsllwil_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vsllwil_wu_hu(v8u16 _1) {
return __builtin_lsx_vsllwil_wu_hu(_1, 1);
}
// CHECK-LABEL: @vsllwil_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vsllwil_du_wu(v4u32 _1) {
return __builtin_lsx_vsllwil_du_wu(_1, 1);
}
// CHECK-LABEL: @vsran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsran_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsran_b_h(_1, _2);
}
// CHECK-LABEL: @vsran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsran_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsran_h_w(_1, _2);
}
// CHECK-LABEL: @vsran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsran_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsran_w_d(_1, _2);
}
// CHECK-LABEL: @vssran_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssran_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssran_b_h(_1, _2);
}
// CHECK-LABEL: @vssran_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssran_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssran_h_w(_1, _2);
}
// CHECK-LABEL: @vssran_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssran_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssran_w_d(_1, _2);
}
// CHECK-LABEL: @vssran_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssran_bu_h(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vssran_bu_h(_1, _2);
}
// CHECK-LABEL: @vssran_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssran_hu_w(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vssran_hu_w(_1, _2);
}
// CHECK-LABEL: @vssran_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssran_wu_d(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vssran_wu_d(_1, _2);
}
// CHECK-LABEL: @vsrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrarn_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrarn_b_h(_1, _2);
}
// CHECK-LABEL: @vsrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrarn_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrarn_h_w(_1, _2);
}
// CHECK-LABEL: @vsrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrarn_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrarn_w_d(_1, _2);
}
// CHECK-LABEL: @vssrarn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrarn_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssrarn_b_h(_1, _2);
}
// CHECK-LABEL: @vssrarn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrarn_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssrarn_h_w(_1, _2);
}
// CHECK-LABEL: @vssrarn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrarn_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssrarn_w_d(_1, _2);
}
// CHECK-LABEL: @vssrarn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrarn_bu_h(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vssrarn_bu_h(_1, _2);
}
// CHECK-LABEL: @vssrarn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrarn_hu_w(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vssrarn_hu_w(_1, _2);
}
// CHECK-LABEL: @vssrarn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrarn_wu_d(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vssrarn_wu_d(_1, _2);
}
// CHECK-LABEL: @vsrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrln_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrln_b_h(_1, _2);
}
// CHECK-LABEL: @vsrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrln_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrln_h_w(_1, _2);
}
// CHECK-LABEL: @vsrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrln_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrln_w_d(_1, _2);
}
// CHECK-LABEL: @vssrln_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrln_bu_h(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vssrln_bu_h(_1, _2);
}
// CHECK-LABEL: @vssrln_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrln_hu_w(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vssrln_hu_w(_1, _2);
}
// CHECK-LABEL: @vssrln_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrln_wu_d(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vssrln_wu_d(_1, _2);
}
// CHECK-LABEL: @vsrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlrn_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrlrn_b_h(_1, _2);
}
// CHECK-LABEL: @vsrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlrn_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrlrn_h_w(_1, _2);
}
// CHECK-LABEL: @vsrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlrn_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrlrn_w_d(_1, _2);
}
// CHECK-LABEL: @vssrlrn_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrlrn_bu_h(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vssrlrn_bu_h(_1, _2);
}
// CHECK-LABEL: @vssrlrn_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrlrn_hu_w(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vssrlrn_hu_w(_1, _2);
}
// CHECK-LABEL: @vssrlrn_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrlrn_wu_d(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vssrlrn_wu_d(_1, _2);
}
// CHECK-LABEL: @vfrstpi_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vfrstpi_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vfrstpi_b(_1, _2, 1);
}
// CHECK-LABEL: @vfrstpi_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vfrstpi_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vfrstpi_h(_1, _2, 1);
}
// CHECK-LABEL: @vfrstp_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vfrstp_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __builtin_lsx_vfrstp_b(_1, _2, _3);
}
// CHECK-LABEL: @vfrstp_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vfrstp_h(v8i16 _1, v8i16 _2, v8i16 _3) {
return __builtin_lsx_vfrstp_h(_1, _2, _3);
}
// CHECK-LABEL: @vshuf4i_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vshuf4i_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vshuf4i_d(_1, _2, 1);
}
// CHECK-LABEL: @vbsrl_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vbsrl_v(v16i8 _1) { return __builtin_lsx_vbsrl_v(_1, 1); }
// CHECK-LABEL: @vbsll_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vbsll_v(v16i8 _1) { return __builtin_lsx_vbsll_v(_1, 1); }
// CHECK-LABEL: @vextrins_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vextrins_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vextrins_b(_1, _2, 1);
}
// CHECK-LABEL: @vextrins_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vextrins_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vextrins_h(_1, _2, 1);
}
// CHECK-LABEL: @vextrins_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vextrins_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vextrins_w(_1, _2, 1);
}
// CHECK-LABEL: @vextrins_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vextrins_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vextrins_d(_1, _2, 1);
}
// CHECK-LABEL: @vmskltz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmskltz_b(v16i8 _1) { return __builtin_lsx_vmskltz_b(_1); }
// CHECK-LABEL: @vmskltz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vmskltz_h(v8i16 _1) { return __builtin_lsx_vmskltz_h(_1); }
// CHECK-LABEL: @vmskltz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vmskltz_w(v4i32 _1) { return __builtin_lsx_vmskltz_w(_1); }
// CHECK-LABEL: @vmskltz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vmskltz_d(v2i64 _1) { return __builtin_lsx_vmskltz_d(_1); }
// CHECK-LABEL: @vsigncov_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsigncov_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsigncov_b(_1, _2);
}
// CHECK-LABEL: @vsigncov_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsigncov_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsigncov_h(_1, _2);
}
// CHECK-LABEL: @vsigncov_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsigncov_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsigncov_w(_1, _2);
}
// CHECK-LABEL: @vsigncov_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsigncov_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsigncov_d(_1, _2);
}
// CHECK-LABEL: @vfmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __builtin_lsx_vfmadd_s(_1, _2, _3);
}
// CHECK-LABEL: @vfmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __builtin_lsx_vfmadd_d(_1, _2, _3);
}
// CHECK-LABEL: @vfmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __builtin_lsx_vfmsub_s(_1, _2, _3);
}
// CHECK-LABEL: @vfmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __builtin_lsx_vfmsub_d(_1, _2, _3);
}
// CHECK-LABEL: @vfnmadd_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfnmadd_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __builtin_lsx_vfnmadd_s(_1, _2, _3);
}
// CHECK-LABEL: @vfnmadd_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfnmadd_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __builtin_lsx_vfnmadd_d(_1, _2, _3);
}
// CHECK-LABEL: @vfnmsub_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]], <4 x float> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4f32 vfnmsub_s(v4f32 _1, v4f32 _2, v4f32 _3) {
return __builtin_lsx_vfnmsub_s(_1, _2, _3);
}
// CHECK-LABEL: @vfnmsub_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]], <2 x double> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2f64 vfnmsub_d(v2f64 _1, v2f64 _2, v2f64 _3) {
return __builtin_lsx_vfnmsub_d(_1, _2, _3);
}
// CHECK-LABEL: @vftintrne_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrne_w_s(v4f32 _1) { return __builtin_lsx_vftintrne_w_s(_1); }
// CHECK-LABEL: @vftintrne_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrne_l_d(v2f64 _1) { return __builtin_lsx_vftintrne_l_d(_1); }
// CHECK-LABEL: @vftintrp_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrp_w_s(v4f32 _1) { return __builtin_lsx_vftintrp_w_s(_1); }
// CHECK-LABEL: @vftintrp_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrp_l_d(v2f64 _1) { return __builtin_lsx_vftintrp_l_d(_1); }
// CHECK-LABEL: @vftintrm_w_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vftintrm_w_s(v4f32 _1) { return __builtin_lsx_vftintrm_w_s(_1); }
// CHECK-LABEL: @vftintrm_l_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrm_l_d(v2f64 _1) { return __builtin_lsx_vftintrm_l_d(_1); }
// CHECK-LABEL: @vftint_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftint_w_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vftint_w_d(_1, _2);
}
// CHECK-LABEL: @vffint_s_l(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x float> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4f32 vffint_s_l(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vffint_s_l(_1, _2);
}
// CHECK-LABEL: @vftintrz_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrz_w_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vftintrz_w_d(_1, _2);
}
// CHECK-LABEL: @vftintrp_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrp_w_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vftintrp_w_d(_1, _2);
}
// CHECK-LABEL: @vftintrm_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrm_w_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vftintrm_w_d(_1, _2);
}
// CHECK-LABEL: @vftintrne_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vftintrne_w_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vftintrne_w_d(_1, _2);
}
// CHECK-LABEL: @vftintl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintl_l_s(v4f32 _1) { return __builtin_lsx_vftintl_l_s(_1); }
// CHECK-LABEL: @vftinth_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftinth_l_s(v4f32 _1) { return __builtin_lsx_vftinth_l_s(_1); }
// CHECK-LABEL: @vffinth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffinth_d_w(v4i32 _1) { return __builtin_lsx_vffinth_d_w(_1); }
// CHECK-LABEL: @vffintl_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x double> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2f64 vffintl_d_w(v4i32 _1) { return __builtin_lsx_vffintl_d_w(_1); }
// CHECK-LABEL: @vftintrzl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrzl_l_s(v4f32 _1) { return __builtin_lsx_vftintrzl_l_s(_1); }
// CHECK-LABEL: @vftintrzh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrzh_l_s(v4f32 _1) { return __builtin_lsx_vftintrzh_l_s(_1); }
// CHECK-LABEL: @vftintrpl_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrpl_l_s(v4f32 _1) { return __builtin_lsx_vftintrpl_l_s(_1); }
// CHECK-LABEL: @vftintrph_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrph_l_s(v4f32 _1) { return __builtin_lsx_vftintrph_l_s(_1); }
// CHECK-LABEL: @vftintrml_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrml_l_s(v4f32 _1) { return __builtin_lsx_vftintrml_l_s(_1); }
// CHECK-LABEL: @vftintrmh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrmh_l_s(v4f32 _1) { return __builtin_lsx_vftintrmh_l_s(_1); }
// CHECK-LABEL: @vftintrnel_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrnel_l_s(v4f32 _1) {
return __builtin_lsx_vftintrnel_l_s(_1);
}
// CHECK-LABEL: @vftintrneh_l_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vftintrneh_l_s(v4f32 _1) {
return __builtin_lsx_vftintrneh_l_s(_1);
}
// CHECK-LABEL: @vfrintrne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrne_s(v4f32 _1) { return __builtin_lsx_vfrintrne_s(_1); }
// CHECK-LABEL: @vfrintrne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrne_d(v2f64 _1) { return __builtin_lsx_vfrintrne_d(_1); }
// CHECK-LABEL: @vfrintrz_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrz_s(v4f32 _1) { return __builtin_lsx_vfrintrz_s(_1); }
// CHECK-LABEL: @vfrintrz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrz_d(v2f64 _1) { return __builtin_lsx_vfrintrz_d(_1); }
// CHECK-LABEL: @vfrintrp_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrp_s(v4f32 _1) { return __builtin_lsx_vfrintrp_s(_1); }
// CHECK-LABEL: @vfrintrp_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrp_d(v2f64 _1) { return __builtin_lsx_vfrintrp_d(_1); }
// CHECK-LABEL: @vfrintrm_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vfrintrm_s(v4f32 _1) { return __builtin_lsx_vfrintrm_s(_1); }
// CHECK-LABEL: @vfrintrm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[_1:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vfrintrm_d(v2f64 _1) { return __builtin_lsx_vfrintrm_d(_1); }
// CHECK-LABEL: @vstelm_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_b(v16i8 _1, void *_2) {
@@ -3349,7 +4577,8 @@ void vstelm_b(v16i8 _1, void *_2) {
}
// CHECK-LABEL: @vstelm_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[_1:%.*]], ptr [[_2:%.*]], i32 2, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> [[TMP0]], ptr [[_2:%.*]], i32 2, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_h(v8i16 _1, void *_2) {
@@ -3357,7 +4586,8 @@ void vstelm_h(v8i16 _1, void *_2) {
}
// CHECK-LABEL: @vstelm_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[_1:%.*]], ptr [[_2:%.*]], i32 4, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> [[TMP0]], ptr [[_2:%.*]], i32 4, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_w(v4i32 _1, void *_2) {
@@ -3365,7 +4595,8 @@ void vstelm_w(v4i32 _1, void *_2) {
}
// CHECK-LABEL: @vstelm_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[_1:%.*]], ptr [[_2:%.*]], i32 8, i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> [[TMP0]], ptr [[_2:%.*]], i32 8, i32 1)
// CHECK-NEXT: ret void
//
void vstelm_d(v2i64 _1, void *_2) {
@@ -3373,1286 +4604,1785 @@ void vstelm_d(v2i64 _1, void *_2) {
}
// CHECK-LABEL: @vaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vaddwev_d_w(_1, _2);
}
// CHECK-LABEL: @vaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwev_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vaddwev_w_h(_1, _2);
}
// CHECK-LABEL: @vaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwev_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vaddwev_h_b(_1, _2);
}
// CHECK-LABEL: @vaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vaddwod_d_w(_1, _2);
}
// CHECK-LABEL: @vaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwod_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vaddwod_w_h(_1, _2);
}
// CHECK-LABEL: @vaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwod_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vaddwod_h_b(_1, _2);
}
// CHECK-LABEL: @vaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_d_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vaddwev_d_wu(_1, _2);
}
// CHECK-LABEL: @vaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwev_w_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vaddwev_w_hu(_1, _2);
}
// CHECK-LABEL: @vaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwev_h_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vaddwev_h_bu(_1, _2);
}
// CHECK-LABEL: @vaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_d_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vaddwod_d_wu(_1, _2);
}
// CHECK-LABEL: @vaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwod_w_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vaddwod_w_hu(_1, _2);
}
// CHECK-LABEL: @vaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwod_h_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vaddwod_h_bu(_1, _2);
}
// CHECK-LABEL: @vaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_d_wu_w(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vaddwev_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwev_w_hu_h(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vaddwev_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwev_h_bu_b(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vaddwev_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_d_wu_w(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vaddwod_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vaddwod_w_hu_h(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vaddwod_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vaddwod_h_bu_b(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vaddwod_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vsubwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsubwev_d_w(_1, _2);
}
// CHECK-LABEL: @vsubwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwev_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsubwev_w_h(_1, _2);
}
// CHECK-LABEL: @vsubwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwev_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsubwev_h_b(_1, _2);
}
// CHECK-LABEL: @vsubwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsubwod_d_w(_1, _2);
}
// CHECK-LABEL: @vsubwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwod_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsubwod_w_h(_1, _2);
}
// CHECK-LABEL: @vsubwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwod_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsubwod_h_b(_1, _2);
}
// CHECK-LABEL: @vsubwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_d_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vsubwev_d_wu(_1, _2);
}
// CHECK-LABEL: @vsubwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwev_w_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vsubwev_w_hu(_1, _2);
}
// CHECK-LABEL: @vsubwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwev_h_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vsubwev_h_bu(_1, _2);
}
// CHECK-LABEL: @vsubwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_d_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vsubwod_d_wu(_1, _2);
}
// CHECK-LABEL: @vsubwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsubwod_w_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vsubwod_w_hu(_1, _2);
}
// CHECK-LABEL: @vsubwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsubwod_h_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vsubwod_h_bu(_1, _2);
}
// CHECK-LABEL: @vaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vaddwev_q_d(_1, _2);
}
// CHECK-LABEL: @vaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vaddwod_q_d(_1, _2);
}
// CHECK-LABEL: @vaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_q_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vaddwev_q_du(_1, _2);
}
// CHECK-LABEL: @vaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_q_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vaddwod_q_du(_1, _2);
}
// CHECK-LABEL: @vsubwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsubwev_q_d(_1, _2);
}
// CHECK-LABEL: @vsubwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsubwod_q_d(_1, _2);
}
// CHECK-LABEL: @vsubwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwev_q_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vsubwev_q_du(_1, _2);
}
// CHECK-LABEL: @vsubwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsubwod_q_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vsubwod_q_du(_1, _2);
}
// CHECK-LABEL: @vaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwev_q_du_d(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vaddwev_q_du_d(_1, _2);
}
// CHECK-LABEL: @vaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vaddwod_q_du_d(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vaddwod_q_du_d(_1, _2);
}
// CHECK-LABEL: @vmulwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vmulwev_d_w(_1, _2);
}
// CHECK-LABEL: @vmulwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwev_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vmulwev_w_h(_1, _2);
}
// CHECK-LABEL: @vmulwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwev_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vmulwev_h_b(_1, _2);
}
// CHECK-LABEL: @vmulwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_d_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vmulwod_d_w(_1, _2);
}
// CHECK-LABEL: @vmulwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwod_w_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vmulwod_w_h(_1, _2);
}
// CHECK-LABEL: @vmulwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwod_h_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vmulwod_h_b(_1, _2);
}
// CHECK-LABEL: @vmulwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_d_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vmulwev_d_wu(_1, _2);
}
// CHECK-LABEL: @vmulwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwev_w_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vmulwev_w_hu(_1, _2);
}
// CHECK-LABEL: @vmulwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwev_h_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vmulwev_h_bu(_1, _2);
}
// CHECK-LABEL: @vmulwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_d_wu(v4u32 _1, v4u32 _2) {
return __builtin_lsx_vmulwod_d_wu(_1, _2);
}
// CHECK-LABEL: @vmulwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwod_w_hu(v8u16 _1, v8u16 _2) {
return __builtin_lsx_vmulwod_w_hu(_1, _2);
}
// CHECK-LABEL: @vmulwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwod_h_bu(v16u8 _1, v16u8 _2) {
return __builtin_lsx_vmulwod_h_bu(_1, _2);
}
// CHECK-LABEL: @vmulwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_d_wu_w(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vmulwev_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vmulwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwev_w_hu_h(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vmulwev_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vmulwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwev_h_bu_b(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vmulwev_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vmulwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_d_wu_w(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vmulwod_d_wu_w(_1, _2);
}
// CHECK-LABEL: @vmulwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vmulwod_w_hu_h(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vmulwod_w_hu_h(_1, _2);
}
// CHECK-LABEL: @vmulwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vmulwod_h_bu_b(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vmulwod_h_bu_b(_1, _2);
}
// CHECK-LABEL: @vmulwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vmulwev_q_d(_1, _2);
}
// CHECK-LABEL: @vmulwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vmulwod_q_d(_1, _2);
}
// CHECK-LABEL: @vmulwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_q_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vmulwev_q_du(_1, _2);
}
// CHECK-LABEL: @vmulwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_q_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vmulwod_q_du(_1, _2);
}
// CHECK-LABEL: @vmulwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwev_q_du_d(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vmulwev_q_du_d(_1, _2);
}
// CHECK-LABEL: @vmulwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vmulwod_q_du_d(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vmulwod_q_du_d(_1, _2);
}
// CHECK-LABEL: @vhaddw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhaddw_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vhaddw_q_d(_1, _2);
}
// CHECK-LABEL: @vhaddw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vhaddw_qu_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vhaddw_qu_du(_1, _2);
}
// CHECK-LABEL: @vhsubw_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vhsubw_q_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vhsubw_q_d(_1, _2);
}
// CHECK-LABEL: @vhsubw_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vhsubw_qu_du(v2u64 _1, v2u64 _2) {
return __builtin_lsx_vhsubw_qu_du(_1, _2);
}
// CHECK-LABEL: @vmaddwev_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_d_w(v2i64 _1, v4i32 _2, v4i32 _3) {
return __builtin_lsx_vmaddwev_d_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwev_w_h(v4i32 _1, v8i16 _2, v8i16 _3) {
return __builtin_lsx_vmaddwev_w_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwev_h_b(v8i16 _1, v16i8 _2, v16i8 _3) {
return __builtin_lsx_vmaddwev_h_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwev_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) {
return __builtin_lsx_vmaddwev_d_wu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4u32 vmaddwev_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) {
return __builtin_lsx_vmaddwev_w_hu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8u16 vmaddwev_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) {
return __builtin_lsx_vmaddwev_h_bu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_d_w(v2i64 _1, v4i32 _2, v4i32 _3) {
return __builtin_lsx_vmaddwod_d_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwod_w_h(v4i32 _1, v8i16 _2, v8i16 _3) {
return __builtin_lsx_vmaddwod_w_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwod_h_b(v8i16 _1, v16i8 _2, v16i8 _3) {
return __builtin_lsx_vmaddwod_h_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_d_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwod_d_wu(v2u64 _1, v4u32 _2, v4u32 _3) {
return __builtin_lsx_vmaddwod_d_wu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_w_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4u32 vmaddwod_w_hu(v4u32 _1, v8u16 _2, v8u16 _3) {
return __builtin_lsx_vmaddwod_w_hu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_h_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8u16 vmaddwod_h_bu(v8u16 _1, v16u8 _2, v16u8 _3) {
return __builtin_lsx_vmaddwod_h_bu(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) {
return __builtin_lsx_vmaddwev_d_wu_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwev_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) {
return __builtin_lsx_vmaddwev_w_hu_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwev_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) {
return __builtin_lsx_vmaddwev_h_bu_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_d_wu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[_1:%.*]], <4 x i32> [[_2:%.*]], <4 x i32> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_d_wu_w(v2i64 _1, v4u32 _2, v4i32 _3) {
return __builtin_lsx_vmaddwod_d_wu_w(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_w_hu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[_1:%.*]], <8 x i16> [[_2:%.*]], <8 x i16> [[_3:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v4i32 vmaddwod_w_hu_h(v4i32 _1, v8u16 _2, v8i16 _3) {
return __builtin_lsx_vmaddwod_w_hu_h(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_h_bu_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v8i16 vmaddwod_h_bu_b(v8i16 _1, v16u8 _2, v16i8 _3) {
return __builtin_lsx_vmaddwod_h_bu_b(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_q_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __builtin_lsx_vmaddwev_q_d(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_q_d(v2i64 _1, v2i64 _2, v2i64 _3) {
return __builtin_lsx_vmaddwod_q_d(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwev_q_du(v2u64 _1, v2u64 _2, v2u64 _3) {
return __builtin_lsx_vmaddwev_q_du(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_q_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2u64 vmaddwod_q_du(v2u64 _1, v2u64 _2, v2u64 _3) {
return __builtin_lsx_vmaddwod_q_du(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwev_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwev_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) {
return __builtin_lsx_vmaddwev_q_du_d(_1, _2, _3);
}
// CHECK-LABEL: @vmaddwod_q_du_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], <2 x i64> [[_3:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v2i64 vmaddwod_q_du_d(v2i64 _1, v2u64 _2, v2i64 _3) {
return __builtin_lsx_vmaddwod_q_du_d(_1, _2, _3);
}
// CHECK-LABEL: @vrotr_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vrotr_b(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vrotr_b(_1, _2);
}
// CHECK-LABEL: @vrotr_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vrotr_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vrotr_h(_1, _2);
}
// CHECK-LABEL: @vrotr_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vrotr_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vrotr_w(_1, _2);
}
// CHECK-LABEL: @vrotr_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vrotr_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vrotr_d(_1, _2);
}
// CHECK-LABEL: @vadd_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vadd_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vadd_q(_1, _2); }
// CHECK-LABEL: @vsub_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsub_q(v2i64 _1, v2i64 _2) { return __builtin_lsx_vsub_q(_1, _2); }
// CHECK-LABEL: @vldrepl_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vldrepl_b(void *_1) { return __builtin_lsx_vldrepl_b(_1, 1); }
// CHECK-LABEL: @vldrepl_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(ptr [[_1:%.*]], i32 2)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v8i16 vldrepl_h(void *_1) { return __builtin_lsx_vldrepl_h(_1, 2); }
// CHECK-LABEL: @vldrepl_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(ptr [[_1:%.*]], i32 4)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v4i32 vldrepl_w(void *_1) { return __builtin_lsx_vldrepl_w(_1, 4); }
// CHECK-LABEL: @vldrepl_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(ptr [[_1:%.*]], i32 8)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vldrepl_d(void *_1) { return __builtin_lsx_vldrepl_d(_1, 8); }
// CHECK-LABEL: @vmskgez_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmskgez_b(v16i8 _1) { return __builtin_lsx_vmskgez_b(_1); }
// CHECK-LABEL: @vmsknz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vmsknz_b(v16i8 _1) { return __builtin_lsx_vmsknz_b(_1); }
// CHECK-LABEL: @vexth_h_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vexth_h_b(v16i8 _1) { return __builtin_lsx_vexth_h_b(_1); }
// CHECK-LABEL: @vexth_w_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vexth_w_h(v8i16 _1) { return __builtin_lsx_vexth_w_h(_1); }
// CHECK-LABEL: @vexth_d_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vexth_d_w(v4i32 _1) { return __builtin_lsx_vexth_d_w(_1); }
// CHECK-LABEL: @vexth_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vexth_q_d(v2i64 _1) { return __builtin_lsx_vexth_q_d(_1); }
// CHECK-LABEL: @vexth_hu_bu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8u16 vexth_hu_bu(v16u8 _1) { return __builtin_lsx_vexth_hu_bu(_1); }
// CHECK-LABEL: @vexth_wu_hu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4u32 vexth_wu_hu(v8u16 _1) { return __builtin_lsx_vexth_wu_hu(_1); }
// CHECK-LABEL: @vexth_du_wu(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vexth_du_wu(v4u32 _1) { return __builtin_lsx_vexth_du_wu(_1); }
// CHECK-LABEL: @vexth_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vexth_qu_du(v2u64 _1) { return __builtin_lsx_vexth_qu_du(_1); }
// CHECK-LABEL: @vrotri_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v16i8 vrotri_b(v16i8 _1) { return __builtin_lsx_vrotri_b(_1, 1); }
// CHECK-LABEL: @vrotri_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v8i16 vrotri_h(v8i16 _1) { return __builtin_lsx_vrotri_h(_1, 1); }
// CHECK-LABEL: @vrotri_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v4i32 vrotri_w(v4i32 _1) { return __builtin_lsx_vrotri_w(_1, 1); }
// CHECK-LABEL: @vrotri_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vrotri_d(v2i64 _1) { return __builtin_lsx_vrotri_d(_1, 1); }
// CHECK-LABEL: @vextl_q_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2i64 vextl_q_d(v2i64 _1) { return __builtin_lsx_vextl_q_d(_1); }
// CHECK-LABEL: @vsrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlni_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsrlni_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vsrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlni_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrlni_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vsrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlni_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrlni_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vsrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrlni_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrlni_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vsrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrlrni_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsrlrni_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vsrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrlrni_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrlrni_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vsrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrlrni_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrlrni_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vsrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrlrni_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrlrni_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrlni_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vssrlni_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrlni_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssrlni_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrlni_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssrlni_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrlni_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssrlni_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrlni_bu_h(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vssrlni_bu_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrlni_hu_w(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vssrlni_hu_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrlni_wu_d(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vssrlni_wu_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrlni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrlni_du_q(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vssrlni_du_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrlrni_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vssrlrni_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrlrni_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssrlrni_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrlrni_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssrlrni_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrlrni_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssrlrni_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrlrni_bu_h(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vssrlrni_bu_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrlrni_hu_w(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vssrlrni_hu_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrlrni_wu_d(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vssrlrni_wu_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrlrni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrlrni_du_q(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vssrlrni_du_q(_1, _2, 1);
}
// CHECK-LABEL: @vsrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrani_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsrani_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vsrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrani_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrani_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vsrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrani_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrani_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vsrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrani_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrani_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vsrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vsrarni_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vsrarni_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vsrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vsrarni_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vsrarni_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vsrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vsrarni_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vsrarni_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vsrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vsrarni_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vsrarni_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrani_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vssrani_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrani_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssrani_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrani_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssrani_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrani_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssrani_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrani_bu_h(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vssrani_bu_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrani_hu_w(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vssrani_hu_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrani_wu_d(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vssrani_wu_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrani_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrani_du_q(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vssrani_du_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrarni_b_h(v16i8 _1, v16i8 _2) {
return __builtin_lsx_vssrarni_b_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrarni_h_w(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssrarni_h_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrarni_w_d(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssrarni_w_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_d_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vssrarni_d_q(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssrarni_d_q(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_bu_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16u8 vssrarni_bu_h(v16u8 _1, v16i8 _2) {
return __builtin_lsx_vssrarni_bu_h(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_hu_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8u16 vssrarni_hu_w(v8u16 _1, v8i16 _2) {
return __builtin_lsx_vssrarni_hu_w(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_wu_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4u32 vssrarni_wu_d(v4u32 _1, v4i32 _2) {
return __builtin_lsx_vssrarni_wu_d(_1, _2, 1);
}
// CHECK-LABEL: @vssrarni_du_q(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2u64 vssrarni_du_q(v2u64 _1, v2i64 _2) {
return __builtin_lsx_vssrarni_du_q(_1, _2, 1);
}
// CHECK-LABEL: @vpermi_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]], i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vpermi_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vpermi_w(_1, _2, 1);
@@ -4660,79 +6390,107 @@ v4i32 vpermi_w(v4i32 _1, v4i32 _2) {
// CHECK-LABEL: @vld(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vld(ptr [[_1:%.*]], i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vld(void *_1) { return __builtin_lsx_vld(_1, 1); }
// CHECK-LABEL: @vst(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vst(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i32 1)
// CHECK-NEXT: ret void
//
void vst(v16i8 _1, void *_2) { return __builtin_lsx_vst(_1, _2, 1); }
// CHECK-LABEL: @vssrlrn_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrlrn_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssrlrn_b_h(_1, _2);
}
// CHECK-LABEL: @vssrlrn_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrlrn_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssrlrn_h_w(_1, _2);
}
// CHECK-LABEL: @vssrlrn_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrlrn_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssrlrn_w_d(_1, _2);
}
// CHECK-LABEL: @vssrln_b_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[_1:%.*]], <8 x i16> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vssrln_b_h(v8i16 _1, v8i16 _2) {
return __builtin_lsx_vssrln_b_h(_1, _2);
}
// CHECK-LABEL: @vssrln_h_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[_1:%.*]], <4 x i32> [[_2:%.*]])
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v8i16 vssrln_h_w(v4i32 _1, v4i32 _2) {
return __builtin_lsx_vssrln_h_w(_1, _2);
}
// CHECK-LABEL: @vssrln_w_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[_1:%.*]], <2 x i64> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vssrln_w_d(v2i64 _1, v2i64 _2) {
return __builtin_lsx_vssrln_w_d(_1, _2);
}
// CHECK-LABEL: @vorn_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v16i8 vorn_v(v16i8 _1, v16i8 _2) { return __builtin_lsx_vorn_v(_1, _2); }
// CHECK-LABEL: @vldi(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vldi(i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vldi() { return __builtin_lsx_vldi(1); }
// CHECK-LABEL: @vshuf_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[_1:%.*]], <16 x i8> [[_2:%.*]], <16 x i8> [[_3:%.*]])
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[_3_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+// CHECK-NEXT: ret i128 [[TMP4]]
//
v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) {
return __builtin_lsx_vshuf_b(_1, _2, _3);
@@ -4740,429 +6498,575 @@ v16i8 vshuf_b(v16i8 _1, v16i8 _2, v16i8 _3) {
// CHECK-LABEL: @vldx(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vldx(ptr [[_1:%.*]], i64 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vldx(void *_1) { return __builtin_lsx_vldx(_1, 1); }
// CHECK-LABEL: @vstx(
// CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[_1:%.*]], ptr [[_2:%.*]], i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: tail call void @llvm.loongarch.lsx.vstx(<16 x i8> [[TMP0]], ptr [[_2:%.*]], i64 1)
// CHECK-NEXT: ret void
//
void vstx(v16i8 _1, void *_2) { return __builtin_lsx_vstx(_1, _2, 1); }
// CHECK-LABEL: @vextl_qu_du(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
//
v2u64 vextl_qu_du(v2u64 _1) { return __builtin_lsx_vextl_qu_du(_1); }
// CHECK-LABEL: @bnz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_b(v16u8 _1) { return __builtin_lsx_bnz_b(_1); }
// CHECK-LABEL: @bnz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_d(v2u64 _1) { return __builtin_lsx_bnz_d(_1); }
// CHECK-LABEL: @bnz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_h(v8u16 _1) { return __builtin_lsx_bnz_h(_1); }
// CHECK-LABEL: @bnz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_v(v16u8 _1) { return __builtin_lsx_bnz_v(_1); }
// CHECK-LABEL: @bnz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bnz_w(v4u32 _1) { return __builtin_lsx_bnz_w(_1); }
// CHECK-LABEL: @bz_b(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_b(v16u8 _1) { return __builtin_lsx_bz_b(_1); }
// CHECK-LABEL: @bz_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_d(v2u64 _1) { return __builtin_lsx_bz_d(_1); }
// CHECK-LABEL: @bz_h(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_h(v8u16 _1) { return __builtin_lsx_bz_h(_1); }
// CHECK-LABEL: @bz_v(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_v(v16u8 _1) { return __builtin_lsx_bz_v(_1); }
// CHECK-LABEL: @bz_w(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[_1:%.*]])
-// CHECK-NEXT: ret i32 [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> [[TMP0]])
+// CHECK-NEXT: ret i32 [[TMP1]]
//
int bz_w(v4u32 _1) { return __builtin_lsx_bz_w(_1); }
// CHECK-LABEL: @vfcmp_caf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_caf_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_caf_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_caf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_caf_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_caf_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_ceq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_ceq_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_ceq_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_ceq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_ceq_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_ceq_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cle_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cle_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cle_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cle_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_clt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_clt_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_clt_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_clt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_clt_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_clt_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cne_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cne_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cne_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cne_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cor_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cor_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cor_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cor_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cueq_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cueq_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cueq_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cueq_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cule_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cule_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cule_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cule_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cult_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cult_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cult_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cult_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cun_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cun_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_cune_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_cune_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_cune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cune_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cune_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_cun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_cun_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_cun_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_saf_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_saf_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_saf_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_saf_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_saf_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_saf_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_seq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_seq_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_seq_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_seq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_seq_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_seq_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sle_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sle_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sle_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sle_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sle_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sle_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_slt_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_slt_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_slt_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_slt_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_slt_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_slt_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sne_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sne_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sne_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sne_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sne_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sne_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sor_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sor_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sor_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sor_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sor_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sor_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sueq_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sueq_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sueq_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sueq_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sueq_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sueq_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sule_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sule_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sule_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sule_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sule_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sule_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sult_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sult_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sult_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sult_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sult_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sult_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sun_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sun_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sun_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sune_d(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[_1:%.*]], <2 x double> [[_2:%.*]])
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v2i64 vfcmp_sune_d(v2f64 _1, v2f64 _2) {
return __builtin_lsx_vfcmp_sune_d(_1, _2);
}
// CHECK-LABEL: @vfcmp_sune_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sune_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sune_s(_1, _2);
}
// CHECK-LABEL: @vfcmp_sun_s(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[_1:%.*]], <4 x float> [[_2:%.*]])
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+// CHECK-NEXT: ret i128 [[TMP3]]
//
v4i32 vfcmp_sun_s(v4f32 _1, v4f32 _2) {
return __builtin_lsx_vfcmp_sun_s(_1, _2);
@@ -5170,24 +7074,28 @@ v4i32 vfcmp_sun_s(v4f32 _1, v4f32 _2) {
// CHECK-LABEL: @vrepli_b(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 1)
-// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v16i8 vrepli_b() { return __builtin_lsx_vrepli_b(1); }
// CHECK-LABEL: @vrepli_d(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 1)
-// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v2i64 vrepli_d() { return __builtin_lsx_vrepli_d(1); }
// CHECK-LABEL: @vrepli_h(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 1)
-// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v8i16 vrepli_h() { return __builtin_lsx_vrepli_h(1); }
// CHECK-LABEL: @vrepli_w(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 1)
-// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
+// CHECK-NEXT: ret i128 [[TMP1]]
//
v4i32 vrepli_w() { return __builtin_lsx_vrepli_w(1); }
diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c
index 3a421f8c6cd4..fbc51b4bf144 100644
--- a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c
+++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c
@@ -6,6 +6,8 @@
// RUN: -disable-O0-optnone | opt -S -passes=mem2reg \
// RUN: | FileCheck %s -check-prefix=RV64ZBB
+#include <riscv_bitmanip.h>
+
// RV32ZBB-LABEL: @orc_b_32(
// RV32ZBB-NEXT: entry:
// RV32ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.riscv.orc.b.i32(i32 [[A:%.*]])
@@ -16,8 +18,8 @@
// RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.riscv.orc.b.i32(i32 [[A:%.*]])
// RV64ZBB-NEXT: ret i32 [[TMP0]]
//
-unsigned int orc_b_32(unsigned int a) {
- return __builtin_riscv_orc_b_32(a);
+uint32_t orc_b_32(uint32_t a) {
+ return __riscv_orc_b_32(a);
}
#if __riscv_xlen == 64
@@ -26,8 +28,8 @@ unsigned int orc_b_32(unsigned int a) {
// RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.orc.b.i64(i64 [[A:%.*]])
// RV64ZBB-NEXT: ret i64 [[TMP0]]
//
-unsigned long orc_b_64(unsigned long a) {
- return __builtin_riscv_orc_b_64(a);
+uint64_t orc_b_64(uint64_t a) {
+ return __riscv_orc_b_64(a);
}
#endif
@@ -41,19 +43,19 @@ unsigned long orc_b_64(unsigned long a) {
// RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[A:%.*]], i1 false)
// RV64ZBB-NEXT: ret i32 [[TMP0]]
//
-unsigned int clz_32(unsigned int a) {
- return __builtin_riscv_clz_32(a);
+unsigned int clz_32(uint32_t a) {
+ return __riscv_clz_32(a);
}
#if __riscv_xlen == 64
// RV64ZBB-LABEL: @clz_64(
// RV64ZBB-NEXT: entry:
// RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false)
-// RV64ZBB-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32
-// RV64ZBB-NEXT: ret i32 [[CAST]]
+// RV64ZBB-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
+// RV64ZBB-NEXT: ret i32 [[CAST_I]]
//
-unsigned int clz_64(unsigned long a) {
- return __builtin_riscv_clz_64(a);
+unsigned int clz_64(uint64_t a) {
+ return __riscv_clz_64(a);
}
#endif
@@ -67,18 +69,44 @@ unsigned int clz_64(unsigned long a) {
// RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[A:%.*]], i1 false)
// RV64ZBB-NEXT: ret i32 [[TMP0]]
//
-unsigned int ctz_32(unsigned int a) {
- return __builtin_riscv_ctz_32(a);
+unsigned int ctz_32(uint32_t a) {
+ return __riscv_ctz_32(a);
}
#if __riscv_xlen == 64
// RV64ZBB-LABEL: @ctz_64(
// RV64ZBB-NEXT: entry:
// RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false)
-// RV64ZBB-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32
-// RV64ZBB-NEXT: ret i32 [[CAST]]
+// RV64ZBB-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
+// RV64ZBB-NEXT: ret i32 [[CAST_I]]
+//
+unsigned int ctz_64(uint64_t a) {
+ return __riscv_ctz_64(a);
+}
+#endif
+
+// RV32ZBB-LABEL: @cpop_32(
+// RV32ZBB-NEXT: entry:
+// RV32ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[A:%.*]])
+// RV32ZBB-NEXT: ret i32 [[TMP0]]
+//
+// RV64ZBB-LABEL: @cpop_32(
+// RV64ZBB-NEXT: entry:
+// RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[A:%.*]])
+// RV64ZBB-NEXT: ret i32 [[TMP0]]
+//
+unsigned int cpop_32(uint32_t a) {
+ return __riscv_cpop_32(a);
+}
+
+#if __riscv_xlen == 64
+// RV64ZBB-LABEL: @cpop_64(
+// RV64ZBB-NEXT: entry:
+// RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[A:%.*]])
+// RV64ZBB-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
+// RV64ZBB-NEXT: ret i32 [[CAST_I]]
//
-unsigned int ctz_64(unsigned long a) {
- return __builtin_riscv_ctz_64(a);
+unsigned int cpop_64(uint64_t a) {
+ return __riscv_cpop_64(a);
}
#endif
diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c
index ae9153eff155..93db3a482ef2 100644
--- a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c
+++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c
@@ -6,7 +6,7 @@
// RUN: -disable-O0-optnone | opt -S -passes=mem2reg \
// RUN: | FileCheck %s -check-prefix=RV64ZBC
-#include <stdint.h>
+#include <riscv_bitmanip.h>
#if __riscv_xlen == 64
// RV64ZBC-LABEL: @clmul_64(
@@ -15,7 +15,7 @@
// RV64ZBC-NEXT: ret i64 [[TMP0]]
//
uint64_t clmul_64(uint64_t a, uint64_t b) {
- return __builtin_riscv_clmul_64(a, b);
+ return __riscv_clmul_64(a, b);
}
// RV64ZBC-LABEL: @clmulh_64(
@@ -24,7 +24,7 @@ uint64_t clmul_64(uint64_t a, uint64_t b) {
// RV64ZBC-NEXT: ret i64 [[TMP0]]
//
uint64_t clmulh_64(uint64_t a, uint64_t b) {
- return __builtin_riscv_clmulh_64(a, b);
+ return __riscv_clmulh_64(a, b);
}
// RV64ZBC-LABEL: @clmulr_64(
@@ -33,7 +33,7 @@ uint64_t clmulh_64(uint64_t a, uint64_t b) {
// RV64ZBC-NEXT: ret i64 [[TMP0]]
//
uint64_t clmulr_64(uint64_t a, uint64_t b) {
- return __builtin_riscv_clmulr_64(a, b);
+ return __riscv_clmulr_64(a, b);
}
#endif
@@ -48,7 +48,7 @@ uint64_t clmulr_64(uint64_t a, uint64_t b) {
// RV64ZBC-NEXT: ret i32 [[TMP0]]
//
uint32_t clmul_32(uint32_t a, uint32_t b) {
- return __builtin_riscv_clmul_32(a, b);
+ return __riscv_clmul_32(a, b);
}
#if __riscv_xlen == 32
@@ -58,7 +58,7 @@ uint32_t clmul_32(uint32_t a, uint32_t b) {
// RV32ZBC-NEXT: ret i32 [[TMP0]]
//
uint32_t clmulh_32(uint32_t a, uint32_t b) {
- return __builtin_riscv_clmulh_32(a, b);
+ return __riscv_clmulh_32(a, b);
}
// RV32ZBC-LABEL: @clmulr_32(
@@ -67,6 +67,6 @@ uint32_t clmulh_32(uint32_t a, uint32_t b) {
// RV32ZBC-NEXT: ret i32 [[TMP0]]
//
uint32_t clmulr_32(uint32_t a, uint32_t b) {
- return __builtin_riscv_clmulr_32(a, b);
+ return __riscv_clmulr_32(a, b);
}
#endif
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c
index 935cb2e007d3..80e1c443eb54 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m1(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m2(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m8(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c
index f34517b24bcf..8c0a6218c1d2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m1(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m2(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m8(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c
index ab7f6627ad1f..b40891f417f2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m1(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m2(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m8(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c
index d0bcdcbf40cc..d106aab64c51 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m1(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m2(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m8(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c
index 839d09c4f9a9..88fae7306944 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c
index b18853043e92..0aec4bfd9fe2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c
index 4cb966b08f23..81965e86f77c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c
index f558151f88a3..f2544cf3ef2a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c
index 05c10840cabf..8fdeac62a31f 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m1_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m2_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_i32m8_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c
index bce1a4e9443f..e02c790dfbeb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m1_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m2_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m8_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c
index 36aaae9caebf..ddeb6de00716 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m1_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m2_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m8_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c
index f5ac2bf0f1f3..397e406c2ee5 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m1_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m2_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_i32m8_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c
index 531bc2b2b942..7b3b25a20331 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c
index 23bba523aaa4..2f3cbeec26fc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c
index 950688c6c785..1f2b2a1c8645 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c
index 7bdce95043ee..923234fe8e2b 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1_tu
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2_tu
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4_tu
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8_tu
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c
index 9d95acc33ddd..582d5fd812bc 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c
@@ -21,19 +21,19 @@ unsigned long test_vlenb(void) {
return __riscv_vlenb();
}
//.
-// RV32: attributes #0 = { mustprogress nofree noinline nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+32bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" }
-// RV32: attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) }
+// RV32: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+32bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" }
+// RV32: attributes #[[ATTR1:[0-9]+]] = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) }
//.
-// RV64: attributes #0 = { mustprogress nofree noinline nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" }
-// RV64: attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) }
+// RV64: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" }
+// RV64: attributes #[[ATTR1:[0-9]+]] = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) }
//.
-// RV32: !0 = !{i32 1, !"wchar_size", i32 4}
-// RV32: !1 = !{i32 1, !"target-abi", !"ilp32d"}
-// RV32: !2 = !{i32 8, !"SmallDataLimit", i32 0}
-// RV32: !3 = !{!"vlenb"}
+// RV32: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// RV32: [[META1:![0-9]+]] = !{i32 1, !"target-abi", !"ilp32d"}
+// RV32: [[META2:![0-9]+]] = !{i32 8, !"SmallDataLimit", i32 0}
+// RV32: [[META3]] = !{!"vlenb"}
//.
-// RV64: !0 = !{i32 1, !"wchar_size", i32 4}
-// RV64: !1 = !{i32 1, !"target-abi", !"lp64d"}
-// RV64: !2 = !{i32 8, !"SmallDataLimit", i32 0}
-// RV64: !3 = !{!"vlenb"}
+// RV64: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// RV64: [[META1:![0-9]+]] = !{i32 1, !"target-abi", !"lp64d"}
+// RV64: [[META2:![0-9]+]] = !{i32 8, !"SmallDataLimit", i32 0}
+// RV64: [[META3]] = !{!"vlenb"}
//.
diff --git a/clang/test/CodeGen/X86/ms-x86-intrinsics.c b/clang/test/CodeGen/X86/ms-x86-intrinsics.c
index b4b8f5dc0b8c..a1c90d71c8eb 100644
--- a/clang/test/CodeGen/X86/ms-x86-intrinsics.c
+++ b/clang/test/CodeGen/X86/ms-x86-intrinsics.c
@@ -145,7 +145,7 @@ unsigned __int64 test__shiftleft128(unsigned __int64 l, unsigned __int64 h,
unsigned char d) {
return __shiftleft128(l, h, d);
}
-// CHECK-X64-LABEL: define dso_local i64 @test__shiftleft128(i64 noundef %l, i64 noundef %h, i8 noundef %d)
+// CHECK-X64-LABEL: define dso_local noundef i64 @test__shiftleft128(i64 noundef %l, i64 noundef %h, i8 noundef %d)
// CHECK-X64: = zext i8 %{{.*}} to i64
// CHECK-X64: = tail call i64 @llvm.fshl.i64(i64 %h, i64 %l, i64 %{{.*}})
// CHECK-X64: ret i64 %
@@ -154,7 +154,7 @@ unsigned __int64 test__shiftright128(unsigned __int64 l, unsigned __int64 h,
unsigned char d) {
return __shiftright128(l, h, d);
}
-// CHECK-X64-LABEL: define dso_local i64 @test__shiftright128(i64 noundef %l, i64 noundef %h, i8 noundef %d)
+// CHECK-X64-LABEL: define dso_local noundef i64 @test__shiftright128(i64 noundef %l, i64 noundef %h, i8 noundef %d)
// CHECK-X64: = zext i8 %{{.*}} to i64
// CHECK-X64: = tail call i64 @llvm.fshr.i64(i64 %h, i64 %l, i64 %{{.*}})
// CHECK-X64: ret i64 %
diff --git a/clang/test/CodeGen/aarch64-branch-protection-attr.c b/clang/test/CodeGen/aarch64-branch-protection-attr.c
index 3c2714e2feda..8ab3e17ade42 100644
--- a/clang/test/CodeGen/aarch64-branch-protection-attr.c
+++ b/clang/test/CodeGen/aarch64-branch-protection-attr.c
@@ -46,6 +46,24 @@ __attribute__ ((target("branch-protection=pac-ret+leaf+bti")))
void btileaf() {}
// CHECK: define{{.*}} void @btileaf() #[[#BTIPACLEAF:]]
+
+__attribute__ ((target("branch-protection=pac-ret+pc")))
+void pauthlr() {}
+// CHECK: define{{.*}} void @pauthlr() #[[#PAUTHLR:]]
+
+__attribute__ ((target("branch-protection=pac-ret+pc+b-key")))
+void pauthlr_bkey() {}
+// CHECK: define{{.*}} void @pauthlr_bkey() #[[#PAUTHLR_BKEY:]]
+
+__attribute__ ((target("branch-protection=pac-ret+pc+leaf")))
+void pauthlr_leaf() {}
+// CHECK: define{{.*}} void @pauthlr_leaf() #[[#PAUTHLR_LEAF:]]
+
+__attribute__ ((target("branch-protection=pac-ret+pc+bti")))
+void pauthlr_bti() {}
+// CHECK: define{{.*}} void @pauthlr_bti() #[[#PAUTHLR_BTI:]]
+
+
// CHECK-DAG: attributes #[[#NONE]] = { {{.*}} "branch-target-enforcement"="false" {{.*}} "sign-return-address"="none"
// CHECK-DAG: attributes #[[#STD]] = { {{.*}} "branch-target-enforcement"="true" {{.*}} "sign-return-address"="non-leaf" "sign-return-address-key"="a_key"
@@ -61,3 +79,13 @@ void btileaf() {}
// CHECK-DAG: attributes #[[#PACBKEYLEAF]] = { {{.*}} "branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="b_key"
// CHECK-DAG: attributes #[[#BTIPACLEAF]] = { {{.*}}"branch-target-enforcement"="true" {{.*}} "sign-return-address"="all" "sign-return-address-key"="a_key"
+
+
+// CHECK-DAG: attributes #[[#PAUTHLR]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key"
+
+// CHECK-DAG: attributes #[[#PAUTHLR_BKEY]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="b_key"
+
+// CHECK-DAG: attributes #[[#PAUTHLR_LEAF]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="a_key"
+
+// CHECK-DAG: attributes #[[#PAUTHLR_BTI]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="true" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key"
+
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
new file mode 100644
index 000000000000..282819c8ca35
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
@@ -0,0 +1,72 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+// CHECK-LABEL: @test_in_streaming_mode(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1
+// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
+// CHECK-NEXT: ret i1 [[TOBOOL_I]]
+//
+// CPP-CHECK-LABEL: @_Z22test_in_streaming_modev(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[AND_I:%.*]] = and i64 [[TMP1]], 1
+// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
+// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]]
+//
+bool test_in_streaming_mode(void) __arm_streaming_compatible {
+ return __arm_in_streaming_mode();
+}
+
+// CHECK-LABEL: @test_za_disable(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_za_disablev(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]]
+// CPP-CHECK-NEXT: ret void
+//
+void test_za_disable(void) __arm_streaming_compatible {
+ __arm_za_disable();
+}
+
+// CHECK-LABEL: @test_has_sme(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
+// CHECK-NEXT: ret i1 [[TOBOOL_I]]
+//
+// CPP-CHECK-LABEL: @_Z12test_has_smev(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CPP-CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
+// CPP-CHECK-NEXT: ret i1 [[TOBOOL_I]]
+//
+bool test_has_sme(void) __arm_streaming_compatible {
+ return __arm_has_sme();
+}
+
+// CHECK-LABEL: @test_svundef_za(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svundef_zav(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: ret void
+//
+void test_svundef_za(void) __arm_streaming_compatible __arm_shared_za {
+ svundef_za();
+}
+
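A note on the checks in acle_sme_state_funs.c above: both __arm_in_streaming_mode() and __arm_has_sme() lower to a call to the support routine __arm_sme_state(), which returns a pair of i64 values, of which only the first is inspected. The IR tests the low bit for streaming mode (an and with 1 followed by icmp ne 0) and the sign bit, bit 63, for SME availability (icmp slt 0). A minimal C sketch of that decoding, purely illustrative and not part of the test:

  // Hypothetical decoding of the first value returned by __arm_sme_state().
  #include <stdbool.h>
  #include <stdint.h>
  static inline bool decode_streaming_mode(uint64_t x0) { return (x0 & 1) != 0; }   // low bit set while in streaming mode
  static inline bool decode_has_sme(uint64_t x0) { return (int64_t)x0 < 0; }        // sign bit set when SME is available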
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
new file mode 100644
index 000000000000..ff4176530710
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
@@ -0,0 +1,297 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used, unused, ... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
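+// For reference, with the definitions above a call such as
+//   SVE_ACLE_FUNC(svdot_za32,,,_f16,_vg1x2)(slice_base, zn, zm)
+// expands to svdot_za32_f16_vg1x2(slice_base, zn, zm) by default, and to the
+// overloaded form svdot_za32_vg1x2(slice_base, zn, zm) when
+// SVE_OVERLOADED_FORMS is defined.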
+
+//
+// Multi, multi (half)
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_f16j13svfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_f16,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_f16j13svfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_f16,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, single (half)
+// CHECK-LABEL: @test_svdot_single_za32_vg1x2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_f16,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_f16,,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, indexed (half)
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,,_f16,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,,_f16,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+
+//
+// Multi, multi (bfloat)
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x2_bf16j14svbfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_bf16,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x4_bf16j14svbfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_bf16,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, single (bfloat)
+// CHECK-LABEL: @test_svdot_single_za32_vg1x2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_bf16,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_bf16,,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, indexed (bfloat)
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,_bf16,,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,_bf16,,_vg1x4)(slice_base, zn, zm, 3);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
new file mode 100644
index 000000000000..2a34b0e2878e
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
@@ -0,0 +1,282 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used, unused, ... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1
+#else
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+#endif
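+// For reference, with the two-argument macro above SVE_ACLE_FUNC(svrinta,_f32_x2)
+// expands to svrinta_f32_x2 by default, and simply to svrinta when
+// SVE_OVERLOADED_FORMS is defined.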
+
+// FRINTA
+
+// CHECK-LABEL: @test_svfrinta_f32_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x213svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+svfloat32x2_t test_svfrinta_f32_x2(svfloat32x2_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrinta,_f32_x2)(zn);
+}
+
+// CHECK-LABEL: @test_svfrinta_f32_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x413svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+svfloat32x4_t test_svfrinta_f32_x4(svfloat32x4_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrinta,_f32_x4)(zn);
+}
+
+// FRINTM
+
+// CHECK-LABEL: @test_svfrintam_f32_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z21test_svfrintam_f32_x213svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+svfloat32x2_t test_svfrintam_f32_x2(svfloat32x2_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrintm,_f32_x2)(zn);
+}
+
+// CHECK-LABEL: @test_svfrintm_f32_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svfrintm_f32_x413svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+svfloat32x4_t test_svfrintm_f32_x4(svfloat32x4_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrintm,_f32_x4)(zn);
+}
+
+// FRINTN
+
+// CHECK-LABEL: @test_svfrintn_f32_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x213svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+svfloat32x2_t test_svfrintn_f32_x2(svfloat32x2_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrintn,_f32_x2)(zn);
+}
+
+// CHECK-LABEL: @test_svfrintn_f32_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x413svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+svfloat32x4_t test_svfrintn_f32_x4(svfloat32x4_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrintn,_f32_x4)(zn);
+}
+
+// FRINTP
+
+// CHECK-LABEL: @test_svfrintp_f32_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x213svfloat32x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]])
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4)
+// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]]
+//
+svfloat32x2_t test_svfrintp_f32_x2(svfloat32x2_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrintp,_f32_x2)(zn);
+}
+
+// CHECK-LABEL: @test_svfrintp_f32_x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x413svfloat32x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1
+// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4)
+// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2
+// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8)
+// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3
+// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12)
+// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]]
+//
+svfloat32x4_t test_svfrintp_f32_x4(svfloat32x4_t zn) __arm_streaming {
+ return SVE_ACLE_FUNC(svrintp,_f32_x4)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
new file mode 100644
index 000000000000..0d85071b7fc3
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
@@ -0,0 +1,1103 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Multi, multi (unsigned)
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_u16,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_u16,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_u8j11svuint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_u8,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_u8j11svuint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_u8,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za64,,,_u16,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za64,,,_u16,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, multi (signed)
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_s16,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_s16,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_s8j10svint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_s8,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_s8j10svint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za32,,,_s8,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za64,,,_s16,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_multi_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_za64,,,_s16,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, single (unsigned)
+// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_u16,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_u16,,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_u8,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_u8,,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za64_vg1x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za64,,_u16,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za64_vg1x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za64,,_u16,,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, single (signed)
+// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_s16,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_s16,,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_s8,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za32,,_s8,,_vg1x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za64_vg1x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za64,,_s16,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svdot_single_za64_vg1x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_single_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_single_za64,,_s16,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed (unsigned)
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,_u16,,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,_u16,,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,_u8,,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,_u8,,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za64,,,_u16,_vg1x2)(slice_base, zn, zm, 1);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za64,,,_u16,_vg1x4)(slice_base, zn, zm, 1);
+}
+
+
+//
+// Multi, indexed (signed)
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,,_s16,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,,_s16,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,,_s8,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za32,,,_s8,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za64,,,_s16,_vg1x2)(slice_base, zn, zm, 1);
+}
+
+// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svdot_lane_za64,,,_s16,_vg1x4)(slice_base, zn, zm, 1);
+}
+
+
+//
+// Multi, multi (unsigned by signed)
+// CHECK-LABEL: @test_svusdot_multi_za32_vg1x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x2_u8j11svuint8x2_t10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svusdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svusdot_za32,,,_u8,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svusdot_multi_za32_vg1x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x4_u8j11svuint8x4_t10svint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svusdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svusdot_za32,,,_u8,_vg1x4)(slice_base, zn, zm);
+}
+
+
+//
+// Multi, single (unsigned by signed)
+// CHECK-LABEL: @test_svusdot_single_za32_vg1x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svusdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svusdot_single_za32,,_u8,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svusdot_single_za32_vg1x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svusdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svusdot_single_za32,,_u8,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed (unsigned by signed)
+// CHECK-LABEL: @test_svusdot_lane_za32_vg1x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svusdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svusdot_lane_za32,,_u8,,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svusdot_lane_za32_vg1x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svusdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svusdot_lane_za32,,_u8,,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+
+//
+// Multi, single (signed by unsigned)
+// CHECK-LABEL: @test_svsudot_single_za32_vg1x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsudot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsudot_single_za32,,_s8,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsudot_single_za32_vg1x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsudot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsudot_single_za32,,_s8,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, multi (signed by unsigned)
+// CHECK-LABEL: @test_svsudot_multi_za32_vg1x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svsudot_multi_za32_vg1x2_s8j10svint8x2_t11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsudot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsudot_za32,,_s8,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svsudot_multi_za32_vg1x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svsudot_multi_za32_vg1x4_s8j10svint8x4_t11svuint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsudot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsudot_za32,,_s8,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed (signed by unsigned)
+// CHECK-LABEL: @test_svsudot_lane_za32_vg1x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsudot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsudot_lane_za32,,_s8,,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svsudot_lane_za32_vg1x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsudot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsudot_lane_za32,,_s8,,_vg1x4)(slice_base, zn, zm, 3);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
new file mode 100644
index 000000000000..f52edd9888da
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
@@ -0,0 +1,292 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
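[Editor's note, not part of the patch: the two SVE_ACLE_FUNC definitions above simply paste their arguments together, so each test body resolves to either the fully-suffixed or the overloaded intrinsic name. A minimal illustration, using the svmla call that appears later in this file:]
// Illustration only (not in the patch):
//   SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x2,,)(slice_base, zn, zm)
// expands to
//   svmla_za32_f32_vg1x2(slice_base, zn, zm)   // plain build: A1##A2##A3##A4##A5
//   svmla_za32_vg1x2(slice_base, zn, zm)       // -DSVE_OVERLOADED_FORMS: A1##A3##A5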
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmla2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_f32j13svfloat32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_f32j13svfloat32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x4,,)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmla_single2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_single_za32,,_f32,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_single_za32,,_f32,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmla_lane2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x2,,)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x4,,)(slice_base, zn, zm, 3);
+}
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmla2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_f64j13svfloat64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_f64j13svfloat64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x4,,)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmla_single2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_single_za64,,_f64,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_single_za64,,_f64,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmla_lane2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x2,,)(slice_base, zn, zm, 1);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x4,,)(slice_base, zn, zm, 1);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
new file mode 100644
index 000000000000..834ade753507
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -0,0 +1,696 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
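+// For illustration: with these definitions, SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)
+// expands to svmla_za32_f16_vg2x2, or to the overloaded form svmla_za32_vg2x2 when
+// SVE_OVERLOADED_FORMS is defined.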
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmla2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_f16j13svfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmla2_bf16j14svbfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_f16j13svfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmla4_bf16j14svbfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmla4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmla_single1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_f16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_bf16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_u16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_s16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_f16,,_vg2x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_bf16,,_vg2x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_u16,,_vg2x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmla_single4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_s16,,_vg2x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+//
+
+// CHECK-LABEL: @test_svmla_lane1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmla_lane1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmla_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x4)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmla_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x4)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x4)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmla_lane4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x4)(slice_base, zn, zm, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
new file mode 100644
index 000000000000..fceb16a48260
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
@@ -0,0 +1,1790 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used, unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
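+
+// For example, the call SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x1,,) used below
+// pastes only A1##A3##A5 when SVE_OVERLOADED_FORMS is defined, resolving to
+// the overloaded svmla_za32_vg4x1; otherwise all five arguments are pasted,
+// giving the fully-suffixed svmla_za32_s8_vg4x1.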
+
+//
+// Single x 1
+//
+
+// MLAL
+
+// CHECK-LABEL: @test_svmla_single_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_uvmlal_single_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_uvmlal_single_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_uvmlal_single_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_uvmlal_single_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// MLSL
+
+// CHECK-LABEL: @test_svmls_single_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_uvmlsl_single_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_uvmlsl_single_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_uvmlsl_single_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_uvmlsl_single_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_sumlall_single_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_sumlall_single_x1_s8ju10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_single_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x1,,)(slice_base, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_single_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x1_u8ju11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm);
+}
+
+//
+// Single x 2
+//
+
+// MLAL
+
+// CHECK-LABEL: @test_svmla_single_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za64,,_s16,,_vg4x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za64,,_u16,,_vg4x2)(slice_base, zn, zm);
+}
+
+// MLSL
+
+// CHECK-LABEL: @test_svmls_single_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za64,,_s16,,_vg4x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za64,,_u16,,_vg4x2)(slice_base, zn, zm);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_svsumla_single_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x2_s8j10svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_single_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x2_u8j11svuint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm);
+}
+
+//
+// Single x 4
+//
+
+// MLAL
+
+// CHECK-LABEL: @test_svmla_single_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za64,,_s16,,_vg4x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmla_single_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_single_za64,,_u16,,_vg4x4)(slice_base, zn, zm);
+}
+
+// MLSL
+
+// CHECK-LABEL: @test_svmls_single_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za64,,_s16,,_vg4x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za64,,_u16,,_vg4x4)(slice_base, zn, zm);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_svsumla_single_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x4_s8j10svint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_single_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x4_u8j11svuint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm);
+}
+
+//
+// Multi x 2
+//
+
+// MLAL
+
+// CHECK-LABEL: @test_mlal_multi_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_s8j10svint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_u8j11svuint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// MLSL
+
+// CHECK-LABEL: @test_mlsl_multi_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_s8j10svint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_u8j11svuint8x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// SUMLALL
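+// (these checks expect the multi-vector svsumla forms to be emitted as the
+// usmla intrinsic with the zn and zm tuple operands swapped)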
+
+// CHECK-LABEL: @test_sumlal_multi_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x2_s8j10svint8x2_t11svuint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x2,,)(slice_base, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlal_multi_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x2_u8j11svuint8x2_t10svint8x2_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm);
+}
+
+//
+// Multi x 4
+//
+
+// MLAL
+
+// CHECK-LABEL: @test_mlal_multi_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_s8j10svint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_u8j11svuint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlal_multi_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// MLSL
+
+// CHECK-LABEL: @test_mlsl_multi_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_s8j10svint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_u8j11svuint8x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_mlsl_multi_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_sumlal_multi_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x4_s8j10svint8x4_t11svuint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x4,,)(slice_base, zn, zm);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlal_multi_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x4_u8j11svuint8x4_t10svint8x4_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm);
+}
+
+//
+// Indexed x 1
+//
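+// (the final argument is the lane index; these tests use the maximum value,
+// 15 for 8-bit and 7 for 16-bit elements)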
+
+// SMLAL
+
+// CHECK-LABEL: @test_smlal_lane_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7);
+}
+
+// SMLSL
+
+// CHECK-LABEL: @test_smlsl_lane_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_s8ju10__SVInt8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_u8ju11__SVUint8_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_sumlall_lane_x1_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x1_s8ju10__SVInt8_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_lane_x1_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x1_u8ju11__SVUint8_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15);
+}
+
+//
+// Indexed x 2
+//
+
+// SMLAL
+
+// CHECK-LABEL: @test_smlal_lane_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7);
+}
+
+// SMLSL
+
+// CHECK-LABEL: @test_smlsl_lane_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_s8j10svint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_u8j11svuint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_sumlall_lane_x2_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x2_s8j10svint8x2_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_usmlall_lane_x2_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x2_u8j11svuint8x2_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15);
+}
+
+//
+// Indexed x 4
+//
+
+// SMLAL
+
+// CHECK-LABEL: @test_smlal_lane_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlal_lane_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7);
+}
+
+// SMLSL
+
+// CHECK-LABEL: @test_smlsl_lane_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_s8j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x4_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_u8j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
+}
+
+// CHECK-LABEL: @test_smlsl_lane_x4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7);
+}
+
+// SUMLALL
+
+// CHECK-LABEL: @test_sumlall_lane_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x4_s8j10svint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15);
+}
+
+// USMLALL
+
+// CHECK-LABEL: @test_usmlall_lane_x4_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x4_s8j11svuint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_lane_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
new file mode 100644
index 000000000000..6830a399e91d
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
@@ -0,0 +1,292 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
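+
+// Note: under SVE_OVERLOADED_FORMS the second and fourth macro arguments are
+// dropped (A1##A3##A5), selecting the type-overloaded intrinsic name;
+// otherwise all five tokens are concatenated to form the fully-suffixed name.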
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmls2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_f32j13svfloat32x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_za32,,_f32,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_f32j13svfloat32x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_za32,,_f32,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmls_single2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_single_za32,,_f32,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_single_za32,,_f32,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmls_lane2_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f32j13svfloat32x2_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f32,,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f32j13svfloat32x4_tu13__SVFloat32_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f32,,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmls2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_f64j13svfloat64x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_za64,,_f64,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_f64j13svfloat64x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_za64,,_f64,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmls_single2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_single_za64,,_f64,,_vg1x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_single_za64,,_f64,,_vg1x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+// CHECK-LABEL: @test_svmls_lane2_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f64j13svfloat64x2_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane_za64,,_f64,,_vg1x2)(slice_base, zn, zm, 1);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_f64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f64j13svfloat64x4_tu13__SVFloat64_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svmls_lane_za64,,_f64,,_vg1x4)(slice_base, zn, zm, 1);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
new file mode 100644
index 000000000000..0a87d97f649c
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
@@ -0,0 +1,696 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
+#endif
+
+//
+// Multi, multi
+// CHECK-LABEL: @test_svmls2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_f16j13svfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmls2_bf16j14svbfloat16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_u16j12svuint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls2_s16j11svint16x2_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_f16j13svfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z16test_svmls4_bf16j14svbfloat16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_u16j12svuint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svmls4_s16j11svint16x4_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8)
+// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16)
+// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base, zn, zm);
+}
+
+//
+// Multi, single
+// CHECK-LABEL: @test_svmls_single1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x1,,)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_f16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_bf16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_u16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_s16,,_vg2x2)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_f16,,_vg2x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svmls_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_bf16,,_vg2x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_u16,,_vg2x4)(slice_base, zn, zm);
+}
+
+// CHECK-LABEL: @test_svmls_single4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z22test_svmls_single4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_single_za32,,_s16,,_vg2x4)(slice_base, zn, zm);
+}
+
+//
+// Multi, indexed
+//
+
+// CHECK-LABEL: @test_svmls_lane1_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_f16ju13__SVFloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane1_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmls_lane1_bf16ju14__SVBfloat16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane1_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_u16ju12__SVUint16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane1_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_s16ju11__SVInt16_tS_(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x1)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f16j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmls_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_u16j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane2_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_s16j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x2)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f16j13svfloat16x4_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x4)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z21test_svmls_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x4)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_u16j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x4)(slice_base, zn, zm, 7);
+}
+
+// CHECK-LABEL: @test_svmls_lane4_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_s16j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x4)(slice_base, zn, zm, 7);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
new file mode 100644
index 000000000000..fb313d4cebd7
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
@@ -0,0 +1,222 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used/unused macro (the middle argument is ignored in the overloaded form), long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
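+// For example, SVE_ACLE_FUNC(svvdot_lane_za32,_bf16,_vg1x2) expands to the
+// overloaded name svvdot_lane_za32_vg1x2 when SVE_OVERLOADED_FORMS is defined,
+// and to the fully suffixed name svvdot_lane_za32_bf16_vg1x2 otherwise.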
+
+// CHECK-LABEL: @test_svvdot_lane_za32_bf16_vg1x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svvdot_lane_za32_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za32_bf16_vg1x2(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za32,_bf16,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svvdot_lane_za32_f16_vg1x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za32_f16_vg1x2(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za32,_f16,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svvdot_lane_za32_s16_vg1x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_s16_vg1x2j11svint16x2_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za32_s16_vg1x2(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za32,_s16,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svvdot_lane_za32_u16_vg1x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_u16_vg1x2j12svuint16x2_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za32_u16_vg1x2(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za32,_u16,_vg1x2)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svvdot_lane_za32_s8_vg1x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za32,_s8,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svvdot_lane_za32_u8_vg1x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za32,_u8,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+// CHECK-LABEL: @test_svvdot_lane_za64_s16_vg1x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_s16_vg1x4j11svint16x4_tu11__SVInt16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za64_s16_vg1x4(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za64,_s16,_vg1x4)(slice_base, zn, zm, 1);
+}
+
+// CHECK-LABEL: @test_svvdot_lane_za64_u16_vg1x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_u16_vg1x4j12svuint16x4_tu12__SVUint16_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svvdot_lane_za64_u16_vg1x4(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svvdot_lane_za64,_u16,_vg1x4)(slice_base, zn, zm, 1);
+}
+
+
+// CHECK-LABEL: @test_svsuvdot_lane_za32_s8_vg1x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svsuvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svsuvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svsuvdot_lane_za32,_s8,_vg1x4)(slice_base, zn, zm, 3);
+}
+
+
+// CHECK-LABEL: @test_svusvdot_lane_za32_u8_vg1x4(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z32test_svusvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svusvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za {
+ SVE_ACLE_FUNC(svusvdot_lane_za32,_u8,_vg1x4)(slice_base, zn, zm, 3);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
index 7a25d31de013..6f1231e776aa 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c
@@ -1,9 +1,12 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
#include <arm_sve.h>
#ifdef SVE_OVERLOADED_FORMS
@@ -13,6 +16,12 @@
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
#endif
+#ifndef TEST_SME2
+#define ATTR
+#else
+#define ATTR __arm_streaming
+#endif
+
// CHECK-LABEL: @test_svld1_u8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
@@ -31,7 +40,7 @@
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
//
-svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base)
+svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u8,_x2,)(pn, base);
}
@@ -54,7 +63,7 @@ svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
//
-svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base)
+svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u16,_x2,)(pn, base);
}
@@ -77,7 +86,7 @@ svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
//
-svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base)
+svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u32,_x2,)(pn, base);
}
@@ -100,7 +109,7 @@ svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
//
-svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base)
+svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u64,_x2,)(pn, base);
}
@@ -131,7 +140,7 @@ svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
//
-svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base)
+svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u8,_x4,)(pn, base);
}
@@ -162,7 +171,7 @@ svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
//
-svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base)
+svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u16,_x4,)(pn, base);
}
@@ -193,7 +202,7 @@ svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
//
-svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base)
+svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u32,_x4,)(pn, base);
}
@@ -224,7 +233,7 @@ svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
//
-svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base)
+svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_u64,_x4,)(pn, base);
}
@@ -247,7 +256,7 @@ svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
//
-svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base)
+svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s8,_x2,)(pn, base);
}
@@ -270,7 +279,7 @@ svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
//
-svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base)
+svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s16,_x2,)(pn, base);
}
@@ -293,7 +302,7 @@ svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
//
-svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base)
+svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s32,_x2,)(pn, base);
}
@@ -316,7 +325,7 @@ svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
//
-svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base)
+svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s64,_x2,)(pn, base);
}
@@ -347,7 +356,7 @@ svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
//
-svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base)
+svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s8,_x4,)(pn, base);
}
@@ -378,7 +387,7 @@ svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
//
-svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base)
+svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s16,_x4,)(pn, base);
}
@@ -409,7 +418,7 @@ svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
//
-svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base)
+svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s32,_x4,)(pn, base);
}
@@ -440,7 +449,7 @@ svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
//
-svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base)
+svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_s64,_x4,)(pn, base);
}
@@ -463,7 +472,7 @@ svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]]
//
-svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base)
+svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_f16,_x2,)(pn, base);
}
@@ -486,7 +495,7 @@ svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
//
-svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base)
+svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_f32,_x2,)(pn, base);
}
@@ -509,7 +518,7 @@ svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]]
//
-svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base)
+svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_f64,_x2,)(pn, base);
}
@@ -540,7 +549,7 @@ svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]]
//
-svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base)
+svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_f16,_x4,)(pn, base);
}
@@ -571,7 +580,7 @@ svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]]
//
-svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base)
+svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_f32,_x4,)(pn, base);
}
@@ -602,7 +611,7 @@ svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]]
//
-svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base)
+svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base) ATTR
{
return SVE_ACLE_FUNC(svld1,_f64,_x4,)(pn, base);
}
@@ -631,7 +640,7 @@ svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
//
-svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum)
+svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u8,_x2,)(pn, base, vnum);
}
@@ -656,7 +665,7 @@ svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnu
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
//
-svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum)
+svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u16,_x2,)(pn, base, vnum);
}
@@ -681,7 +690,7 @@ svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
//
-svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum)
+svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u32,_x2,)(pn, base, vnum);
}
@@ -706,7 +715,7 @@ svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
//
-svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum)
+svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u64,_x2,)(pn, base, vnum);
}
@@ -739,7 +748,7 @@ svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
//
-svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum)
+svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u8,_x4,)(pn, base, vnum);
}
@@ -772,7 +781,7 @@ svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnu
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
//
-svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum)
+svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u16,_x4,)(pn, base, vnum);
}
@@ -805,7 +814,7 @@ svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
//
-svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum)
+svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u32,_x4,)(pn, base, vnum);
}
@@ -838,7 +847,7 @@ svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
//
-svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum)
+svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_u64,_x4,)(pn, base, vnum);
}
@@ -863,7 +872,7 @@ svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
//
-svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum)
+svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s8,_x2,)(pn, base, vnum);
}
@@ -888,7 +897,7 @@ svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
//
-svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum)
+svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s16,_x2,)(pn, base, vnum);
}
@@ -913,7 +922,7 @@ svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vn
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
//
-svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum)
+svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s32,_x2,)(pn, base, vnum);
}
@@ -938,7 +947,7 @@ svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vn
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
//
-svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum)
+svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s64,_x2,)(pn, base, vnum);
}
@@ -971,7 +980,7 @@ svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vn
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
//
-svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum)
+svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s8,_x4,)(pn, base, vnum);
}
@@ -1004,7 +1013,7 @@ svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum)
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
//
-svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum)
+svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s16,_x4,)(pn, base, vnum);
}
@@ -1037,7 +1046,7 @@ svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vn
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
//
-svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum)
+svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s32,_x4,)(pn, base, vnum);
}
@@ -1070,7 +1079,7 @@ svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vn
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
//
-svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum)
+svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_s64,_x4,)(pn, base, vnum);
}
@@ -1095,7 +1104,7 @@ svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vn
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP5]]
//
-svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum)
+svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_f16,_x2,)(pn, base, vnum);
}
@@ -1120,7 +1129,7 @@ svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP5]]
//
-svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum)
+svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_f32,_x2,)(pn, base, vnum);
}
@@ -1145,7 +1154,7 @@ svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP5]]
//
-svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum)
+svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_f64,_x2,)(pn, base, vnum);
}
@@ -1178,7 +1187,7 @@ svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP9]]
//
-svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum)
+svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_f16,_x4,)(pn, base, vnum);
}
@@ -1211,7 +1220,7 @@ svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP9]]
//
-svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum)
+svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_f32,_x4,)(pn, base, vnum);
}
@@ -1244,7 +1253,7 @@ svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]]
//
-svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum)
+svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svld1_vnum,_f64,_x4,)(pn, base, vnum);
}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
index 7a0fcde819dc..3f61cc3de139 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c
@@ -1,9 +1,12 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
#include <arm_sve.h>
#ifdef SVE_OVERLOADED_FORMS
@@ -13,6 +16,12 @@
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
#endif
+#ifndef TEST_SME2
+#define ATTR
+#else
+#define ATTR __arm_streaming
+#endif
+
// CHECK-LABEL: @test_svldnt1_u8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
@@ -31,7 +40,7 @@
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
//
-svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base)
+svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u8,_x2,)(pn, base);
}
@@ -54,7 +63,7 @@ svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
//
-svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base)
+svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u16,_x2,)(pn, base);
}
@@ -77,7 +86,7 @@ svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
//
-svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base)
+svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u32,_x2,)(pn, base);
}
@@ -100,7 +109,7 @@ svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
//
-svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base)
+svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u64,_x2,)(pn, base);
}
@@ -131,7 +140,7 @@ svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
//
-svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base)
+svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u8,_x4,)(pn, base);
}
@@ -162,7 +171,7 @@ svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
//
-svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base)
+svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u16,_x4,)(pn, base);
}
@@ -193,7 +202,7 @@ svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
//
-svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base)
+svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u32,_x4,)(pn, base);
}
@@ -224,7 +233,7 @@ svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
//
-svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base)
+svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_u64,_x4,)(pn, base);
}
@@ -247,7 +256,7 @@ svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]]
//
-svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base)
+svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s8,_x2,)(pn, base);
}
@@ -270,7 +279,7 @@ svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]]
//
-svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base)
+svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s16,_x2,)(pn, base);
}
@@ -293,7 +302,7 @@ svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]]
//
-svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base)
+svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s32,_x2,)(pn, base);
}
@@ -316,7 +325,7 @@ svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]]
//
-svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base)
+svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s64,_x2,)(pn, base);
}
@@ -347,7 +356,7 @@ svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]]
//
-svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base)
+svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s8,_x4,)(pn, base);
}
@@ -378,7 +387,7 @@ svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]]
//
-svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base)
+svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s16,_x4,)(pn, base);
}
@@ -409,7 +418,7 @@ svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]]
//
-svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base)
+svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s32,_x4,)(pn, base);
}
@@ -440,7 +449,7 @@ svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]]
//
-svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base)
+svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_s64,_x4,)(pn, base);
}
@@ -463,7 +472,7 @@ svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]]
//
-svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base)
+svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_f16,_x2,)(pn, base);
}
@@ -486,7 +495,7 @@ svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]]
//
-svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base)
+svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_f32,_x2,)(pn, base);
}
@@ -509,7 +518,7 @@ svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]]
//
-svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base)
+svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_f64,_x2,)(pn, base);
}
@@ -540,7 +549,7 @@ svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]]
//
-svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base)
+svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_f16,_x4,)(pn, base);
}
@@ -571,7 +580,7 @@ svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]]
//
-svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base)
+svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_f32,_x4,)(pn, base);
}
@@ -602,7 +611,7 @@ svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base)
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]]
//
-svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base)
+svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base) ATTR
{
return SVE_ACLE_FUNC(svldnt1,_f64,_x4,)(pn, base);
}
@@ -631,7 +640,7 @@ svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
//
-svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum)
+svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x2,)(pn, base, vnum);
}
@@ -656,7 +665,7 @@ svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t v
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
//
-svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum)
+svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x2,)(pn, base, vnum);
}
@@ -681,7 +690,7 @@ svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
//
-svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum)
+svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x2,)(pn, base, vnum);
}
@@ -706,7 +715,7 @@ svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
//
-svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum)
+svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x2,)(pn, base, vnum);
}
@@ -739,7 +748,7 @@ svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
//
-svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum)
+svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x4,)(pn, base, vnum);
}
@@ -772,7 +781,7 @@ svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t v
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
//
-svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum)
+svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x4,)(pn, base, vnum);
}
@@ -805,7 +814,7 @@ svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
//
-svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum)
+svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x4,)(pn, base, vnum);
}
@@ -838,7 +847,7 @@ svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
//
-svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum)
+svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x4,)(pn, base, vnum);
}
@@ -863,7 +872,7 @@ svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16)
// CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]]
//
-svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum)
+svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x2,)(pn, base, vnum);
}
@@ -888,7 +897,7 @@ svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnu
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]]
//
-svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum)
+svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x2,)(pn, base, vnum);
}
@@ -913,7 +922,7 @@ svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]]
//
-svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum)
+svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x2,)(pn, base, vnum);
}
@@ -938,7 +947,7 @@ svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]]
//
-svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum)
+svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x2,)(pn, base, vnum);
}
@@ -971,7 +980,7 @@ svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48)
// CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]]
//
-svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum)
+svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x4,)(pn, base, vnum);
}
@@ -1004,7 +1013,7 @@ svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnu
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]]
//
-svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum)
+svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x4,)(pn, base, vnum);
}
@@ -1037,7 +1046,7 @@ svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]]
//
-svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum)
+svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x4,)(pn, base, vnum);
}
@@ -1070,7 +1079,7 @@ svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]]
//
-svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum)
+svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x4,)(pn, base, vnum);
}
@@ -1095,7 +1104,7 @@ svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP5]]
//
-svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum)
+svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x2,)(pn, base, vnum);
}
@@ -1120,7 +1129,7 @@ svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int6
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4)
// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP5]]
//
-svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum)
+svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x2,)(pn, base, vnum);
}
@@ -1145,7 +1154,7 @@ svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int6
// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2)
// CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP5]]
//
-svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum)
+svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x2,)(pn, base, vnum);
}
@@ -1178,7 +1187,7 @@ svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int6
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP9]]
//
-svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum)
+svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x4,)(pn, base, vnum);
}
@@ -1211,7 +1220,7 @@ svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int6
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12)
// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP9]]
//
-svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum)
+svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x4,)(pn, base, vnum);
}
@@ -1244,7 +1253,7 @@ svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int6
// CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6)
// CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]]
//
-svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum)
+svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) ATTR
{
return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x4,)(pn, base, vnum);
}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c
index be7ec719edea..fb53ea456c81 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_qcvtn.c
@@ -8,13 +8,11 @@
// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME2 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sve.h>
#ifdef SVE_OVERLOADED_FORMS
// A simple used,unused... macro, long enough to represent any SVE builtin.
@@ -23,6 +21,12 @@
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
#endif
+#ifndef TEST_SME2
+#define ATTR
+#else
+#define ATTR __arm_streaming
+#endif
+
// CHECK-LABEL: @test_qcvtn_s16_s32_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[ZN:%.*]], i64 0)
@@ -37,7 +41,7 @@
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvtn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
//
-svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) __arm_streaming_compatible {
+svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) ATTR {
return SVE_ACLE_FUNC(svqcvtn_s16,_s32_x2,,)(zn);
}
@@ -55,7 +59,7 @@ svint16_t test_qcvtn_s16_s32_x2(svint32x2_t zn) __arm_streaming_compatible {
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uqcvtn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
//
-svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) __arm_streaming_compatible {
+svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) ATTR {
return SVE_ACLE_FUNC(svqcvtn_u16,_u32_x2,,)(zn);
}
@@ -73,6 +77,6 @@ svuint16_t test_qcvtn_u16_u32_x2(svuint32x2_t zn) __arm_streaming_compatible {
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sve.sqcvtun.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]])
// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
//
-svuint16_t test_qcvtn_u16_s32_x2(svint32x2_t zn) __arm_streaming_compatible {
+svuint16_t test_qcvtn_u16_s32_x2(svint32x2_t zn) ATTR {
return SVE_ACLE_FUNC(svqcvtn_u16,_s32_x2,,)(zn);
}
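
The qcvtn test above and the st1/stnt1 tests that follow all lean on the same TEST_SME2/ATTR device: the new `-DTEST_SME2` RUN lines make every test function a streaming function, so a single set of CHECK lines is exercised under both the `+sve2p1` and the `+sme2 -DTEST_SME2` configurations. Below is a minimal sketch of that pattern, reusing the macro exactly as added in the hunk above; the `example_qcvtn` wrapper is hypothetical and not part of this patch, it only illustrates where ATTR lands.

    #ifndef TEST_SME2
    #define ATTR                    // plain SVE2p1 RUN lines: no extra attribute
    #else
    #define ATTR __arm_streaming    // -DTEST_SME2 RUN lines: functions become streaming
    #endif

    // Hypothetical wrapper (assumes arm_sve.h and SVE_ACLE_FUNC as in the test file):
    svint16_t example_qcvtn(svint32x2_t zn) ATTR {
      return SVE_ACLE_FUNC(svqcvtn_s16,_s32_x2,,)(zn); // identical body in both modes
    }
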
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
index 9efc37a1dd58..7aa994345a8c 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c
@@ -1,9 +1,11 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sve.h>
#ifdef SVE_OVERLOADED_FORMS
@@ -13,6 +15,12 @@
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
#endif
+#ifndef TEST_SME2
+#define ATTR
+#else
+#define ATTR __arm_streaming
+#endif
+
// CHECK-LABEL: @test_svst1_u8_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0)
@@ -27,7 +35,7 @@
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v)
+void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u8_x2,,)(pn, base, v);
}
@@ -46,7 +54,7 @@ void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v)
+void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u16_x2,,)(pn, base, v);
}
@@ -65,7 +73,7 @@ void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v)
+void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u32_x2,,)(pn, base, v);
}
@@ -84,7 +92,7 @@ void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v)
+void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u64_x2,,)(pn, base, v);
}
@@ -107,7 +115,7 @@ void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v)
+void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u8_x4,,)(pn, base, v);
}
@@ -130,7 +138,7 @@ void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v)
+void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u16_x4,,)(pn, base, v);
}
@@ -153,7 +161,7 @@ void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v)
+void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u32_x4,,)(pn, base, v);
}
@@ -176,7 +184,7 @@ void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v)
+void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_u64_x4,,)(pn, base, v);
}
@@ -195,7 +203,7 @@ void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v)
+void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s8_x2,,)(pn, base, v);
}
@@ -214,7 +222,7 @@ void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v)
+void test_svst1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s16_x2,,)(pn, base, v);
}
@@ -233,7 +241,7 @@ void test_svst1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v)
+void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s32_x2,,)(pn, base, v);
}
@@ -252,7 +260,7 @@ void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v)
+void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s64_x2,,)(pn, base, v);
}
@@ -275,7 +283,7 @@ void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v)
+void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s8_x4,,)(pn, base, v);
}
@@ -298,7 +306,7 @@ void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v)
+void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s16_x4,,)(pn, base, v);
}
@@ -321,7 +329,7 @@ void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v)
+void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s32_x4,,)(pn, base, v);
}
@@ -344,7 +352,7 @@ void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v)
+void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_s64_x4,,)(pn, base, v);
}
@@ -363,7 +371,7 @@ void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v)
+void test_svst1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_f16_x2,,)(pn, base, v);
}
@@ -382,7 +390,7 @@ void test_svst1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v)
+void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_f32_x2,,)(pn, base, v);
}
@@ -401,7 +409,7 @@ void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v)
+void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_f64_x2,,)(pn, base, v);
}
@@ -424,7 +432,7 @@ void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v)
+void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_f16_x4,,)(pn, base, v);
}
@@ -447,7 +455,7 @@ void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v)
+void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_f32_x4,,)(pn, base, v);
}
@@ -470,7 +478,7 @@ void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v)
+void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1,_f64_x4,,)(pn, base, v);
}
@@ -495,7 +503,7 @@ void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v)
+void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u8_x2,,)(pn, base, vnum, v);
}
@@ -516,7 +524,7 @@ void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v)
+void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u16_x2,,)(pn, base, vnum, v);
}
@@ -537,7 +545,7 @@ void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v)
+void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u32_x2,,)(pn, base, vnum, v);
}
@@ -558,7 +566,7 @@ void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v)
+void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u64_x2,,)(pn, base, vnum, v);
}
@@ -583,7 +591,7 @@ void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v)
+void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u8_x4,,)(pn, base, vnum, v);
}
@@ -608,7 +616,7 @@ void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v)
+void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u16_x4,,)(pn, base, vnum, v);
}
@@ -633,7 +641,7 @@ void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v)
+void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u32_x4,,)(pn, base, vnum, v);
}
@@ -658,7 +666,7 @@ void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v)
+void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_u64_x4,,)(pn, base, vnum, v);
}
@@ -679,7 +687,7 @@ void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v)
+void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s8_x2,,)(pn, base, vnum, v);
}
@@ -700,7 +708,7 @@ void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v)
+void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s16_x2,,)(pn, base, vnum, v);
}
@@ -721,7 +729,7 @@ void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v)
+void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s32_x2,,)(pn, base, vnum, v);
}
@@ -742,7 +750,7 @@ void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v)
+void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s64_x2,,)(pn, base, vnum, v);
}
@@ -767,7 +775,7 @@ void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v)
+void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s8_x4,,)(pn, base, vnum, v);
}
@@ -792,7 +800,7 @@ void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v)
+void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s16_x4,,)(pn, base, vnum, v);
}
@@ -817,7 +825,7 @@ void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v)
+void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s32_x4,,)(pn, base, vnum, v);
}
@@ -842,7 +850,7 @@ void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v)
+void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_s64_x4,,)(pn, base, vnum, v);
}
@@ -865,7 +873,7 @@ void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v)
+void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_f16_x2,,)(pn, base, vnum, v);
}
@@ -888,7 +896,7 @@ void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v)
+void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_f32_x2,,)(pn, base, vnum, v);
}
@@ -911,7 +919,7 @@ void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v)
+void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_f64_x2,,)(pn, base, vnum, v);
}
@@ -938,7 +946,7 @@ void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v)
+void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_f16_x4,,)(pn, base, vnum, v);
}
@@ -965,7 +973,7 @@ void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v)
+void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_f32_x4,,)(pn, base, vnum, v);
}
@@ -992,7 +1000,7 @@ void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svflo
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v)
+void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svst1_vnum,_f64_x4,,)(pn, base, vnum, v);
}
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
index 9b860fe7180e..0d8696a7634a 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c
@@ -1,9 +1,12 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
#include <arm_sve.h>
#ifdef SVE_OVERLOADED_FORMS
@@ -13,6 +16,11 @@
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
#endif
+#ifndef TEST_SME2
+#define ATTR
+#else
+#define ATTR __arm_streaming
+#endif
// CHECK-LABEL: @test_svstnt1_u8_x2(
// CHECK-NEXT: entry:
@@ -28,7 +36,7 @@
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v)
+void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u8_x2,,)(pn, base, v);
}
@@ -48,7 +56,7 @@ void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v)
+void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u16_x2,,)(pn, base, v);
}
@@ -68,7 +76,7 @@ void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v)
+void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u32_x2,,)(pn, base, v);
}
@@ -88,7 +96,7 @@ void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v)
+void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u64_x2,,)(pn, base, v);
}
@@ -112,7 +120,7 @@ void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v)
+void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u8_x4,,)(pn, base, v);
}
@@ -136,7 +144,7 @@ void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v)
+void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u16_x4,,)(pn, base, v);
}
@@ -160,7 +168,7 @@ void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v)
+void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u32_x4,,)(pn, base, v);
}
@@ -184,7 +192,7 @@ void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v)
+void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_u64_x4,,)(pn, base, v);
}
@@ -204,7 +212,7 @@ void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v)
+void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s8_x2,,)(pn, base, v);
}
@@ -224,7 +232,7 @@ void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v)
+void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s16_x2,,)(pn, base, v);
}
@@ -244,7 +252,7 @@ void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v)
+void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s32_x2,,)(pn, base, v);
}
@@ -264,7 +272,7 @@ void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v)
+void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s64_x2,,)(pn, base, v);
}
@@ -288,7 +296,7 @@ void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v)
+void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s8_x4,,)(pn, base, v);
}
@@ -312,7 +320,7 @@ void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v)
+void test_svstnt1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s16_x4,,)(pn, base, v);
}
@@ -336,7 +344,7 @@ void test_svstnt1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v)
+void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s32_x4,,)(pn, base, v);
}
@@ -360,7 +368,7 @@ void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v)
+void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_s64_x4,,)(pn, base, v);
}
@@ -380,7 +388,7 @@ void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v)
+void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_f16_x2,,)(pn, base, v);
}
@@ -400,7 +408,7 @@ void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v)
+void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_f32_x2,,)(pn, base, v);
}
@@ -420,7 +428,7 @@ void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v)
+void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_f64_x2,,)(pn, base, v);
}
@@ -444,7 +452,7 @@ void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v)
+void test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_f16_x4,,)(pn, base, v);
}
@@ -468,7 +476,7 @@ void test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v)
+void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_f32_x4,,)(pn, base, v);
}
@@ -492,7 +500,7 @@ void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v)
+void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1,_f64_x4,,)(pn, base, v);
}
@@ -518,7 +526,7 @@ void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v)
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v)
+void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x2,,)(pn, base, vnum, v);
}
@@ -540,7 +548,7 @@ void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v)
+void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x2,,)(pn, base, vnum, v);
}
@@ -562,7 +570,7 @@ void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v)
+void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u32_x2,,)(pn, base, vnum, v);
}
@@ -584,7 +592,7 @@ void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v)
+void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x2,,)(pn, base, vnum, v);
}
@@ -610,7 +618,7 @@ void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v)
+void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x4,,)(pn, base, vnum, v);
}
@@ -636,7 +644,7 @@ void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v)
+void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x4,,)(pn, base, vnum, v);
}
@@ -662,7 +670,7 @@ void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v)
+void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u32_x4,,)(pn, base, vnum, v);
}
@@ -688,7 +696,7 @@ void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v)
+void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x4,,)(pn, base, vnum, v);
}
@@ -710,7 +718,7 @@ void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v)
+void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x2,,)(pn, base, vnum, v);
}
@@ -732,7 +740,7 @@ void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v)
+void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x2,,)(pn, base, vnum, v);
}
@@ -754,7 +762,7 @@ void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v)
+void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x2,,)(pn, base, vnum, v);
}
@@ -776,7 +784,7 @@ void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v)
+void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x2,,)(pn, base, vnum, v);
}
@@ -802,7 +810,7 @@ void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v)
+void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x4,,)(pn, base, vnum, v);
}
@@ -828,7 +836,7 @@ void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v)
+void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x4,,)(pn, base, vnum, v);
}
@@ -854,7 +862,7 @@ void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v)
+void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x4,,)(pn, base, vnum, v);
}
@@ -880,7 +888,7 @@ void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v)
+void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x4,,)(pn, base, vnum, v);
}
@@ -904,7 +912,7 @@ void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v)
+void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x2,,)(pn, base, vnum, v);
}
@@ -928,7 +936,7 @@ void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v)
+void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x2,,)(pn, base, vnum, v);
}
@@ -952,7 +960,7 @@ void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v)
+void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x2,,)(pn, base, vnum, v);
}
@@ -980,7 +988,7 @@ void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v)
+void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x4,,)(pn, base, vnum, v);
}
@@ -1008,7 +1016,7 @@ void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v)
+void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x4,,)(pn, base, vnum, v);
}
@@ -1036,7 +1044,7 @@ void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svf
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]])
// CPP-CHECK-NEXT: ret void
//
-void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v)
+void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) ATTR
{
return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x4,,)(pn, base, vnum, v);
}
diff --git a/clang/test/CodeGen/arm-bf16-params-returns.c b/clang/test/CodeGen/arm-bf16-params-returns.c
index 2d33e3f45eac..21be0bb15169 100644
--- a/clang/test/CodeGen/arm-bf16-params-returns.c
+++ b/clang/test/CodeGen/arm-bf16-params-returns.c
@@ -11,7 +11,7 @@
__bf16 test_ret_bf16(__bf16 v) {
return v;
}
-// CHECK32-HARD: define{{.*}} arm_aapcs_vfpcc bfloat @test_ret_bf16(bfloat noundef returned %v) {{.*}} {
+// CHECK32-HARD: define{{.*}} arm_aapcs_vfpcc noundef bfloat @test_ret_bf16(bfloat noundef returned %v) {{.*}} {
// CHECK32-HARD: ret bfloat %v
// CHECK32-SOFTFP: define{{.*}} bfloat @test_ret_bf16(bfloat noundef returned %v) {{.*}} {
// CHECK32-SOFTFP: ret bfloat %v
@@ -23,11 +23,11 @@ __bf16 test_ret_bf16(__bf16 v) {
bfloat16x4_t test_ret_bf16x4_t(bfloat16x4_t v) {
return v;
}
-// CHECK32-HARD: define{{.*}} arm_aapcs_vfpcc <4 x bfloat> @test_ret_bf16x4_t(<4 x bfloat> noundef returned %v) {{.*}} {
+// CHECK32-HARD: define{{.*}} arm_aapcs_vfpcc noundef <4 x bfloat> @test_ret_bf16x4_t(<4 x bfloat> noundef returned %v) {{.*}} {
// CHECK32-HARD: ret <4 x bfloat> %v
-// CHECK32-SOFTFP: define{{.*}} <2 x i32> @test_ret_bf16x4_t(<2 x i32> [[V0:.*]]) {{.*}} {
+// CHECK32-SOFTFP: define{{.*}} noundef <2 x i32> @test_ret_bf16x4_t(<2 x i32> [[V0:.*]]) {{.*}} {
// CHECK32-SOFTFP: ret <2 x i32> %v
-// CHECK64NEON: define{{.*}} <4 x bfloat> @test_ret_bf16x4_t(<4 x bfloat> noundef returned %v) {{.*}} {
+// CHECK64NEON: define{{.*}} noundef <4 x bfloat> @test_ret_bf16x4_t(<4 x bfloat> noundef returned %v) {{.*}} {
// CHECK64NEON: ret <4 x bfloat> %v
#endif \ No newline at end of file
diff --git a/clang/test/CodeGen/arm-vector_type-params-returns.c b/clang/test/CodeGen/arm-vector_type-params-returns.c
index 14c3512ab81a..a55aba9ce066 100644
--- a/clang/test/CodeGen/arm-vector_type-params-returns.c
+++ b/clang/test/CodeGen/arm-vector_type-params-returns.c
@@ -27,7 +27,7 @@
#endif
// function return types
-// CHECK-LABEL: define dso_local <8 x half> @test_ret_v8f16(
+// CHECK-LABEL: define dso_local noundef <8 x half> @test_ret_v8f16(
// CHECK-SAME: <8 x half> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <8 x half> [[V]]
@@ -36,7 +36,7 @@ float16x8_t test_ret_v8f16(float16x8_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <4 x float> @test_ret_v4f32(
+// CHECK-LABEL: define dso_local noundef <4 x float> @test_ret_v4f32(
// CHECK-SAME: <4 x float> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <4 x float> [[V]]
@@ -45,7 +45,7 @@ float32x4_t test_ret_v4f32(float32x4_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <2 x double> @test_ret_v2f64(
+// CHECK-LABEL: define dso_local noundef <2 x double> @test_ret_v2f64(
// CHECK-SAME: <2 x double> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <2 x double> [[V]]
@@ -54,7 +54,7 @@ float64x2_t test_ret_v2f64(float64x2_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <8 x bfloat> @test_ret_v8bf16(
+// CHECK-LABEL: define dso_local noundef <8 x bfloat> @test_ret_v8bf16(
// CHECK-SAME: <8 x bfloat> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <8 x bfloat> [[V]]
@@ -63,7 +63,7 @@ bfloat16x8_t test_ret_v8bf16(bfloat16x8_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <16 x i8> @test_ret_v16s8(
+// CHECK-LABEL: define dso_local noundef <16 x i8> @test_ret_v16s8(
// CHECK-SAME: <16 x i8> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <16 x i8> [[V]]
@@ -72,7 +72,7 @@ int8x16_t test_ret_v16s8(int8x16_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <8 x i16> @test_ret_v8s16(
+// CHECK-LABEL: define dso_local noundef <8 x i16> @test_ret_v8s16(
// CHECK-SAME: <8 x i16> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <8 x i16> [[V]]
@@ -81,7 +81,7 @@ int16x8_t test_ret_v8s16(int16x8_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <4 x i32> @test_ret_v32s4(
+// CHECK-LABEL: define dso_local noundef <4 x i32> @test_ret_v32s4(
// CHECK-SAME: <4 x i32> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <4 x i32> [[V]]
@@ -90,7 +90,7 @@ int32x4_t test_ret_v32s4(int32x4_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <2 x i64> @test_ret_v64s2(
+// CHECK-LABEL: define dso_local noundef <2 x i64> @test_ret_v64s2(
// CHECK-SAME: <2 x i64> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <2 x i64> [[V]]
@@ -99,7 +99,7 @@ int64x2_t test_ret_v64s2(int64x2_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <16 x i8> @test_ret_v16u8(
+// CHECK-LABEL: define dso_local noundef <16 x i8> @test_ret_v16u8(
// CHECK-SAME: <16 x i8> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <16 x i8> [[V]]
@@ -108,7 +108,7 @@ uint8x16_t test_ret_v16u8(uint8x16_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <8 x i16> @test_ret_v8u16(
+// CHECK-LABEL: define dso_local noundef <8 x i16> @test_ret_v8u16(
// CHECK-SAME: <8 x i16> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <8 x i16> [[V]]
@@ -117,7 +117,7 @@ uint16x8_t test_ret_v8u16(uint16x8_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <4 x i32> @test_ret_v32u4(
+// CHECK-LABEL: define dso_local noundef <4 x i32> @test_ret_v32u4(
// CHECK-SAME: <4 x i32> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <4 x i32> [[V]]
@@ -126,7 +126,7 @@ uint32x4_t test_ret_v32u4(uint32x4_t v) {
return v;
}
-// CHECK-LABEL: define dso_local <2 x i64> @test_ret_v64u2(
+// CHECK-LABEL: define dso_local noundef <2 x i64> @test_ret_v64u2(
// CHECK-SAME: <2 x i64> noundef returned [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: ret <2 x i64> [[V]]
diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
index 447c3803dec8..8bdcd9af20ef 100644
--- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
+++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-globals.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-128
-// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-512
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=1 -mvscale-max=1 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-64
+// RUN: %clang_cc1 -triple riscv64-none-linux-gnu -target-feature +f -target-feature +d -target-feature +zve64d -mvscale-min=4 -mvscale-max=4 -S -O1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-256
// REQUIRES: riscv-registered-target
@@ -25,17 +25,17 @@ fixed_int64m1_t global_i64;
// WRITES
//===----------------------------------------------------------------------===//
-// CHECK-128-LABEL: @write_global_i64(
-// CHECK-128-NEXT: entry:
-// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64(<vscale x 1 x i64> [[V:%.*]], i64 0)
-// CHECK-128-NEXT: store <1 x i64> [[CASTFIXEDSVE]], ptr @global_i64, align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-128-NEXT: ret void
+// CHECK-64-LABEL: @write_global_i64(
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64(<vscale x 1 x i64> [[V:%.*]], i64 0)
+// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-64-NEXT: ret void
//
-// CHECK-512-LABEL: @write_global_i64(
-// CHECK-512-NEXT: entry:
-// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64(<vscale x 1 x i64> [[V:%.*]], i64 0)
-// CHECK-512-NEXT: store <4 x i64> [[CASTFIXEDSVE]], ptr @global_i64, align 8, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-512-NEXT: ret void
+// CHECK-256-LABEL: @write_global_i64(
+// CHECK-256-NEXT: entry:
+// CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64(<vscale x 1 x i64> [[V:%.*]], i64 0)
+// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-256-NEXT: ret void
//
void write_global_i64(vint64m1_t v) { global_i64 = v; }
@@ -43,16 +43,16 @@ void write_global_i64(vint64m1_t v) { global_i64 = v; }
// READS
//===----------------------------------------------------------------------===//
-// CHECK-128-LABEL: @read_global_i64(
-// CHECK-128-NEXT: entry:
-// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @global_i64, align 8, !tbaa [[TBAA4]]
-// CHECK-128-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 1 x i64> @llvm.vector.insert.nxv1i64.v1i64(<vscale x 1 x i64> undef, <1 x i64> [[TMP0]], i64 0)
-// CHECK-128-NEXT: ret <vscale x 1 x i64> [[CASTSCALABLESVE]]
+// CHECK-64-LABEL: @read_global_i64(
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @global_i64, align 8, !tbaa [[TBAA4]]
+// CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 1 x i64> @llvm.vector.insert.nxv1i64.v1i64(<vscale x 1 x i64> undef, <1 x i64> [[TMP0]], i64 0)
+// CHECK-64-NEXT: ret <vscale x 1 x i64> [[CAST_SCALABLE]]
//
-// CHECK-512-LABEL: @read_global_i64(
-// CHECK-512-NEXT: entry:
-// CHECK-512-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @global_i64, align 8, !tbaa [[TBAA4]]
-// CHECK-512-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 1 x i64> @llvm.vector.insert.nxv1i64.v4i64(<vscale x 1 x i64> undef, <4 x i64> [[TMP0]], i64 0)
-// CHECK-512-NEXT: ret <vscale x 1 x i64> [[CASTSCALABLESVE]]
+// CHECK-256-LABEL: @read_global_i64(
+// CHECK-256-NEXT: entry:
+// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @global_i64, align 8, !tbaa [[TBAA4]]
+// CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 1 x i64> @llvm.vector.insert.nxv1i64.v4i64(<vscale x 1 x i64> undef, <4 x i64> [[TMP0]], i64 0)
+// CHECK-256-NEXT: ret <vscale x 1 x i64> [[CAST_SCALABLE]]
//
vint64m1_t read_global_i64() { return global_i64; }
diff --git a/clang/test/CodeGen/ifunc.c b/clang/test/CodeGen/ifunc.c
index a29b500e80bd..3aa29f7dff74 100644
--- a/clang/test/CodeGen/ifunc.c
+++ b/clang/test/CodeGen/ifunc.c
@@ -51,11 +51,11 @@ void* goo_ifunc(void) {
// CHECK: call i32 @foo(i32
// CHECK: call void @goo()
-// SAN: define internal nonnull ptr @foo_ifunc() #[[#FOO_IFUNC:]] {
-// MACSAN: define internal nonnull ptr @foo_ifunc() #[[#FOO_IFUNC:]] {
+// SAN: define internal nonnull {{(noundef )?}}ptr @foo_ifunc() #[[#FOO_IFUNC:]] {
+// MACSAN: define internal nonnull {{(noundef )?}}ptr @foo_ifunc() #[[#FOO_IFUNC:]] {
-// SAN: define dso_local noalias ptr @goo_ifunc() #[[#GOO_IFUNC:]] {
-// MACSAN: define noalias ptr @goo_ifunc() #[[#GOO_IFUNC:]] {
+// SAN: define dso_local noalias {{(noundef )?}}ptr @goo_ifunc() #[[#GOO_IFUNC:]] {
+// MACSAN: define noalias {{(noundef )?}}ptr @goo_ifunc() #[[#GOO_IFUNC:]] {
// SAN-DAG: attributes #[[#FOO_IFUNC]] = {{{.*}} disable_sanitizer_instrumentation {{.*}}
// MACSAN-DAG: attributes #[[#FOO_IFUNC]] = {{{.*}} disable_sanitizer_instrumentation {{.*}}
diff --git a/clang/test/CodeGen/isfpclass.c b/clang/test/CodeGen/isfpclass.c
index 34873c08e04f..88d7a21b9733 100644
--- a/clang/test/CodeGen/isfpclass.c
+++ b/clang/test/CodeGen/isfpclass.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
// RUN: %clang_cc1 -triple aarch64-linux-gnu -S -O1 -emit-llvm %s -o - | FileCheck %s
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_finite
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_finite
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
@@ -12,7 +12,7 @@ _Bool check_isfpclass_finite(float x) {
return __builtin_isfpclass(x, 504 /*Finite*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_finite_strict
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_finite_strict
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 504) #[[ATTR6:[0-9]+]]
@@ -23,7 +23,7 @@ _Bool check_isfpclass_finite_strict(float x) {
return __builtin_isfpclass(x, 504 /*Finite*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_nan_f32
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_nan_f32
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = fcmp uno float [[X]], 0.000000e+00
@@ -33,7 +33,7 @@ _Bool check_isfpclass_nan_f32(float x) {
return __builtin_isfpclass(x, 3 /*NaN*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_nan_f32_strict
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_nan_f32_strict
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 3) #[[ATTR6]]
@@ -44,7 +44,7 @@ _Bool check_isfpclass_nan_f32_strict(float x) {
return __builtin_isfpclass(x, 3 /*NaN*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_snan_f64
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_snan_f64
// CHECK-SAME: (double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f64(double [[X]], i32 1)
@@ -54,7 +54,7 @@ _Bool check_isfpclass_snan_f64(double x) {
return __builtin_isfpclass(x, 1 /*SNaN*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_snan_f64_strict
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_snan_f64_strict
// CHECK-SAME: (double noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f64(double [[X]], i32 1) #[[ATTR6]]
@@ -65,7 +65,7 @@ _Bool check_isfpclass_snan_f64_strict(double x) {
return __builtin_isfpclass(x, 1 /*NaN*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_zero_f16
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_zero_f16
// CHECK-SAME: (half noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq half [[X]], 0xH0000
@@ -75,7 +75,7 @@ _Bool check_isfpclass_zero_f16(_Float16 x) {
return __builtin_isfpclass(x, 96 /*Zero*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isfpclass_zero_f16_strict
+// CHECK-LABEL: define dso_local noundef i1 @check_isfpclass_zero_f16_strict
// CHECK-SAME: (half noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f16(half [[X]], i32 96) #[[ATTR6]]
@@ -86,7 +86,7 @@ _Bool check_isfpclass_zero_f16_strict(_Float16 x) {
return __builtin_isfpclass(x, 96 /*Zero*/);
}
-// CHECK-LABEL: define dso_local i1 @check_isnan
+// CHECK-LABEL: define dso_local noundef i1 @check_isnan
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 3) #[[ATTR6]]
@@ -97,7 +97,7 @@ _Bool check_isnan(float x) {
return __builtin_isnan(x);
}
-// CHECK-LABEL: define dso_local i1 @check_isinf
+// CHECK-LABEL: define dso_local noundef i1 @check_isinf
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 516) #[[ATTR6]]
@@ -108,7 +108,7 @@ _Bool check_isinf(float x) {
return __builtin_isinf(x);
}
-// CHECK-LABEL: define dso_local i1 @check_isfinite
+// CHECK-LABEL: define dso_local noundef i1 @check_isfinite
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 504) #[[ATTR6]]
@@ -119,7 +119,7 @@ _Bool check_isfinite(float x) {
return __builtin_isfinite(x);
}
-// CHECK-LABEL: define dso_local i1 @check_isnormal
+// CHECK-LABEL: define dso_local noundef i1 @check_isnormal
// CHECK-SAME: (float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[X]], i32 264) #[[ATTR6]]
@@ -136,7 +136,7 @@ typedef double __attribute__((ext_vector_type(4))) double4;
typedef int __attribute__((ext_vector_type(4))) int4;
typedef long __attribute__((ext_vector_type(4))) long4;
-// CHECK-LABEL: define dso_local <4 x i32> @check_isfpclass_nan_v4f32
+// CHECK-LABEL: define dso_local noundef <4 x i32> @check_isfpclass_nan_v4f32
// CHECK-SAME: (<4 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = fcmp uno <4 x float> [[X]], zeroinitializer
@@ -147,7 +147,7 @@ int4 check_isfpclass_nan_v4f32(float4 x) {
return __builtin_isfpclass(x, 3 /*NaN*/);
}
-// CHECK-LABEL: define dso_local <4 x i32> @check_isfpclass_nan_strict_v4f32
+// CHECK-LABEL: define dso_local noundef <4 x i32> @check_isfpclass_nan_strict_v4f32
// CHECK-SAME: (<4 x float> noundef [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[X]], i32 3) #[[ATTR6]]
diff --git a/clang/test/CodeGen/ms-mixed-ptr-sizes.c b/clang/test/CodeGen/ms-mixed-ptr-sizes.c
index a853c6083cb8..89d05fd30b72 100644
--- a/clang/test/CodeGen/ms-mixed-ptr-sizes.c
+++ b/clang/test/CodeGen/ms-mixed-ptr-sizes.c
@@ -49,7 +49,7 @@ void test_other(struct Foo *f, __attribute__((address_space(10))) int *i) {
}
int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local i32 @test_compare1
+ // ALL-LABEL: define dso_local noundef i32 @test_compare1
// X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(271)
// X64: %cmp = icmp eq ptr addrspace(271) %{{.+}}, %i
// X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr addrspace(271)
@@ -58,7 +58,7 @@ int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) {
}
int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local i32 @test_compare2
+ // ALL-LABEL: define dso_local noundef i32 @test_compare2
// X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(270)
// X64: %cmp = icmp eq ptr addrspace(270) %{{.+}}, %i
// X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr
@@ -67,7 +67,7 @@ int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) {
}
int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local i32 @test_compare3
+ // ALL-LABEL: define dso_local noundef i32 @test_compare3
// X64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr
// X64: %cmp = icmp eq ptr %{{.+}}, %j
// X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272)
@@ -76,7 +76,7 @@ int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) {
}
int test_compare4(int *__ptr32 __sptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local i32 @test_compare4
+ // ALL-LABEL: define dso_local noundef i32 @test_compare4
// X64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr
// X64: %cmp = icmp eq ptr %{{.+}}, %j
// X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272)
diff --git a/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu b/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
index f12eacfb53fa..ef02668c3697 100644
--- a/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
+++ b/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
@@ -111,16 +111,16 @@ __global__ void kernel_f64(double* out, double* a, double* b, double* c) {
}
}
-// INTERNALIZE: define internal half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
-// INTERNALIZE: define internal float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
-// INTERNALIZE: define internal double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
-// INTERNALIZE: define internal float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]
+// INTERNALIZE: define internal {{(noundef )?}}half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
+// INTERNALIZE: define internal {{(noundef )?}}float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
+// INTERNALIZE: define internal {{(noundef )?}}double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
+// INTERNALIZE: define internal {{(noundef )?}}float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]
-// NOINTERNALIZE: define dso_local half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
-// NOINTERNALIZE: define dso_local float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
-// NOINTERNALIZE: define dso_local double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
-// NOINTERNALIZE: define weak float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]
+// NOINTERNALIZE: define dso_local {{(noundef )?}}half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
+// NOINTERNALIZE: define dso_local {{(noundef )?}}float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
+// NOINTERNALIZE: define dso_local {{(noundef )?}}double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
+// NOINTERNALIZE: define weak {{(noundef )?}}float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]
diff --git a/clang/test/CodeGenOpenCL/as_type.cl b/clang/test/CodeGenOpenCL/as_type.cl
index d8e75db936f7..1fe26fbeafdb 100644
--- a/clang/test/CodeGenOpenCL/as_type.cl
+++ b/clang/test/CodeGenOpenCL/as_type.cl
@@ -27,7 +27,7 @@ char3 f3(int x) {
return __builtin_astype(x, char3);
}
-//CHECK: define{{.*}} spir_func <4 x i8> @f4(i32 noundef %[[x:.*]])
+//CHECK: define{{.*}} spir_func noundef <4 x i8> @f4(i32 noundef %[[x:.*]])
//CHECK: %[[astype:.*]] = bitcast i32 %[[x]] to <4 x i8>
//CHECK-NOT: shufflevector
//CHECK: ret <4 x i8> %[[astype]]
@@ -43,7 +43,7 @@ int f5(char3 x) {
return __builtin_astype(x, int);
}
-//CHECK: define{{.*}} spir_func i32 @f6(<4 x i8> noundef %[[x:.*]])
+//CHECK: define{{.*}} spir_func noundef i32 @f6(<4 x i8> noundef %[[x:.*]])
//CHECK: %[[astype:.*]] = bitcast <4 x i8> %[[x]] to i32
//CHECK-NOT: shufflevector
//CHECK: ret i32 %[[astype]]
@@ -51,7 +51,7 @@ int f6(char4 x) {
return __builtin_astype(x, int);
}
-//CHECK: define{{.*}} spir_func <3 x i8> @f7(<3 x i8> noundef returned %[[x:.*]])
+//CHECK: define{{.*}} spir_func noundef <3 x i8> @f7(<3 x i8> noundef returned %[[x:.*]])
//CHECK-NOT: bitcast
//CHECK-NOT: shufflevector
//CHECK: ret <3 x i8> %[[x]]
@@ -67,21 +67,21 @@ int3 f8(char16 x) {
return __builtin_astype(x, int3);
}
-//CHECK: define{{.*}} spir_func ptr addrspace(1) @addr_cast(ptr noundef readnone %[[x:.*]])
+//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone %[[x:.*]])
//CHECK: %[[cast:.*]] ={{.*}} addrspacecast ptr %[[x]] to ptr addrspace(1)
//CHECK: ret ptr addrspace(1) %[[cast]]
global int* addr_cast(int *x) {
return __builtin_astype(x, global int*);
}
-//CHECK: define{{.*}} spir_func ptr addrspace(1) @int_to_ptr(i32 noundef %[[x:.*]])
+//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @int_to_ptr(i32 noundef %[[x:.*]])
//CHECK: %[[cast:.*]] = inttoptr i32 %[[x]] to ptr addrspace(1)
//CHECK: ret ptr addrspace(1) %[[cast]]
global int* int_to_ptr(int x) {
return __builtin_astype(x, global int*);
}
-//CHECK: define{{.*}} spir_func i32 @ptr_to_int(ptr noundef %[[x:.*]])
+//CHECK: define{{.*}} spir_func noundef i32 @ptr_to_int(ptr noundef %[[x:.*]])
//CHECK: %[[cast:.*]] = ptrtoint ptr %[[x]] to i32
//CHECK: ret i32 %[[cast]]
int ptr_to_int(int *x) {
diff --git a/clang/test/Driver/aarch64-pauth-lr.c b/clang/test/Driver/aarch64-pauth-lr.c
new file mode 100644
index 000000000000..2e1b530fc989
--- /dev/null
+++ b/clang/test/Driver/aarch64-pauth-lr.c
@@ -0,0 +1,23 @@
+// Check the -cc1 flags for the various forms of -mbranch-protection=pac-ret+pc.
+
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI
+// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI
+
+// PAUTH-LR: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr"
+// PAUTH-LR-B-KEY: "-msign-return-address=non-leaf" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr"
+// PAUTH-LR-LEAF: "-msign-return-address=all" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr"
+// PAUTH-LR-BTI: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr"
+// PAUTH-LR-LEAF-B-KEY-BTI: "-msign-return-address=all" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr" "-mbranch-target-enforce"
+
+// NOT-PAUTH-LR: "-mbranch-target-enforce"
+// NOT-PAUTH-LR-B-KEY: "-mbranch-target-enforce"
+// NOT-PAUTH-LR-LEAF: "-mbranch-target-enforce"
+// NOT-PAUTH-LR-BTI: "-mbranch-target-enforce"
diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c
index 366cade86a9f..13069c04c8d1 100644
--- a/clang/test/Driver/aarch64-v95a.c
+++ b/clang/test/Driver/aarch64-v95a.c
@@ -1,3 +1,5 @@
+// ===== Base v9.5a architecture =====
+
// RUN: %clang -target aarch64 -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// RUN: %clang -target aarch64 -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// RUN: %clang -target aarch64 -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
@@ -5,6 +7,7 @@
// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a"
+
// RUN: %clang -target aarch64_be -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64_be -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64 -mbig-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
@@ -18,3 +21,11 @@
// RUN: %clang -target aarch64 -march=armv9.5a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s
// RUN: %clang -target aarch64 -march=armv9.5-a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s
// V95A-CPA: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+cpa"
+
+// RUN: %clang -target aarch64 -march=armv9.5a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s
+// RUN: %clang -target aarch64 -march=armv9.5-a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s
+// V95A-PAUTHLR: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+pauth-lr"
+
+// RUN: %clang -target aarch64 -march=armv9.5a+tlbiw -### -c %s 2>&1 | FileCheck -check-prefix=V95A-TLBIW %s
+// RUN: %clang -target aarch64 -march=armv9.5-a+tlbiw -### -c %s 2>&1 | FileCheck -check-prefix=V95A-TLBIW %s
+// V95A-TLBIW: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+tlbiw"
diff --git a/clang/test/Driver/fbasic-block-sections.c b/clang/test/Driver/fbasic-block-sections.c
index 417cf9b6319b..24262209d1e4 100644
--- a/clang/test/Driver/fbasic-block-sections.c
+++ b/clang/test/Driver/fbasic-block-sections.c
@@ -1,15 +1,15 @@
-// RUN: %clang -### -target x86_64 -fbasic-block-sections=none %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NONE %s
-// RUN: %clang -### -target x86_64 -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-ALL %s
-// RUN: %clang -### -target x86_64 -fbasic-block-sections=list=%s %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LIST %s
-// RUN: %clang -### -target x86_64 -fbasic-block-sections=labels %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LABELS %s
-// RUN: not %clang -c -target arm-unknown-linux -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
-// RUN: %clang -### -target arm-unknown-linux -fbasic-block-sections=all -fbasic-block-sections=none %s -S 2>&1 \
+// RUN: %clang -### --target=x86_64 -fbasic-block-sections=none %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NONE %s
+// RUN: %clang -### --target=x86_64 -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-ALL %s
+// RUN: %clang -### --target=x86_64 -fbasic-block-sections=list=%s %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LIST %s
+// RUN: %clang -### --target=x86_64 -fbasic-block-sections=labels %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-LABELS %s
+// RUN: not %clang -c --target=arm-unknown-linux -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
+// RUN: %clang -### --target=arm-unknown-linux -fbasic-block-sections=all -fbasic-block-sections=none %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=CHECK-NOOPT %s
-// RUN: not %clang -c -target x86_64-apple-darwin10 -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
-// RUN: not %clang -### -target x86_64 -fbasic-block-sections=alll %s -S 2>&1 | FileCheck -check-prefix=CHECK-INVALID-VALUE %s
-// RUN: not %clang -### -target x86_64 -fbasic-block-sections=list %s -S 2>&1 | FileCheck -check-prefix=CHECK-INVALID-VALUE %s
-// RUN: %clang -### -target x86_64 -fbasic-block-sections=list= %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NULL-LIST %s
-// RUN: %clang -### -target x86_64 -fbasic-block-sections=none %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NONE %s
+// RUN: not %clang -c --target=x86_64-apple-darwin10 -fbasic-block-sections=all %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
+// RUN: not %clang -### --target=x86_64 -fbasic-block-sections=alll %s -S 2>&1 | FileCheck -check-prefix=CHECK-INVALID-VALUE %s
+// RUN: not %clang -### --target=x86_64 -fbasic-block-sections=list %s -S 2>&1 | FileCheck -check-prefix=CHECK-INVALID-VALUE %s
+// RUN: %clang -### --target=x86_64 -fbasic-block-sections=list= %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NULL-LIST %s
+// RUN: %clang -### --target=x86_64 -fbasic-block-sections=none %s -S 2>&1 | FileCheck -check-prefix=CHECK-OPT-NONE %s
// RUN: %clang -### -x cuda -nocudainc -nocudalib --target=x86_64 -fbasic-block-sections=all --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s -c 2>&1 \
// RUN: | FileCheck -check-prefix=CHECK-CUDA %s
diff --git a/clang/test/Driver/modules.m b/clang/test/Driver/modules.m
index 9eb356980556..d1a65f5cb071 100644
--- a/clang/test/Driver/modules.m
+++ b/clang/test/Driver/modules.m
@@ -57,7 +57,21 @@
// CHECK-MODULE-MAP-FILES: "-fmodule-map-file=foo.map"
// CHECK-MODULE-MAP-FILES: "-fmodule-map-file=bar.map"
-// RUN: %clang -fmodules -fbuiltin-module-map -### %s 2>&1 | FileCheck -check-prefix=CHECK-BUILTIN-MODULE-MAP %s
+// Verify that the driver propagates the -fmodule-name and -fmodule-map-file flags when
+// -fmodules-decluse or -fmodules-strict-decluse is used for the layering check.
+// RUN: %clang -fmodules-decluse -fmodule-name=foo -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-DECLUSE-PROPAGATE-MODULE-NAME %s
+// CHECK-DECLUSE-PROPAGATE-MODULE-NAME: -fmodule-name=foo
+
+// RUN: %clang -fmodules-decluse -fmodule-map-file=foo.map -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-DECLUSE-PROPAGATE-MODULE-MAPS %s
+// CHECK-DECLUSE-PROPAGATE-MODULE-MAPS: -fmodule-map-file=foo.map
+
+// RUN: %clang -fmodules-strict-decluse -fmodule-name=foo -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-NAME %s
+// CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-NAME: -fmodule-name=foo
+
+// RUN: %clang -fmodules-strict-decluse -fmodule-map-file=foo.map -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-MAPS %s
+// CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-MAPS: -fmodule-map-file=foo.map
+
+// RUN: %clang -fmodules -fbuiltin-module-map -### %s 2>&1 | FileCheck -check-prefix=CHECK-BUILTIN-MODULE-MAP %s
// CHECK-BUILTIN-MODULE-MAP: "-fmodules"
// CHECK-BUILTIN-MODULE-MAP: "-fmodule-map-file={{.*}}include{{/|\\\\}}module.modulemap"
diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c
index f32d511ec3d1..d181755bb585 100644
--- a/clang/test/Driver/riscv-cpus.c
+++ b/clang/test/Driver/riscv-cpus.c
@@ -153,7 +153,6 @@
// MCPU-SIFIVE-S76: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d"
// MCPU-SIFIVE-S76: "-target-feature" "+c"
// MCPU-SIFIVE-S76: "-target-feature" "+zicsr" "-target-feature" "+zifencei" "-target-feature" "+zihintpause"
-// MCPU-SIFIVE-S76: "-target-feature" "+xsfcie"
// MCPU-SIFIVE-S76: "-target-abi" "lp64d"
// mcpu with default march
diff --git a/clang/test/Driver/riscv-default-features.c b/clang/test/Driver/riscv-default-features.c
index 6e48f7cc37dc..4c3883c1cc11 100644
--- a/clang/test/Driver/riscv-default-features.c
+++ b/clang/test/Driver/riscv-default-features.c
@@ -2,9 +2,7 @@
// RUN: %clang --target=riscv64-unknown-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix=RV64
// RV32: "target-features"="+32bit,+a,+c,+m,+relax,
-// RV32-SAME: -save-restore
// RV64: "target-features"="+64bit,+a,+c,+m,+relax,
-// RV64-SAME: -save-restore
// Dummy function
int foo(void){
diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c
index 716f3f6da57b..d3700f71aa7e 100644
--- a/clang/test/Driver/riscv-features.c
+++ b/clang/test/Driver/riscv-features.c
@@ -24,7 +24,7 @@
// SAVE-RESTORE: "-target-feature" "+save-restore"
// NO-SAVE-RESTORE: "-target-feature" "-save-restore"
-// DEFAULT: "-target-feature" "-save-restore"
+// DEFAULT-NOT: "-target-feature" "-save-restore"
// DEFAULT-NOT: "-target-feature" "+save-restore"
// RUN: %clang --target=riscv32-unknown-elf -### %s -munaligned-access 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS
diff --git a/clang/test/Format/clang-format-ignore.cpp b/clang/test/Format/clang-format-ignore.cpp
new file mode 100644
index 000000000000..0d6396a64a66
--- /dev/null
+++ b/clang/test/Format/clang-format-ignore.cpp
@@ -0,0 +1,33 @@
+// RUN: rm -rf %t.dir
+// RUN: mkdir -p %t.dir/level1/level2
+
+// RUN: cd %t.dir
+// RUN: echo "*" > .clang-format-ignore
+// RUN: echo "level*/*.c*" >> .clang-format-ignore
+// RUN: echo "*/*2/foo.*" >> .clang-format-ignore
+// RUN: touch foo.cc
+// RUN: clang-format -verbose .clang-format-ignore foo.cc 2> %t.stderr
+// RUN: not grep Formatting %t.stderr
+
+// RUN: cd level1
+// RUN: touch bar.cc baz.c
+// RUN: clang-format -verbose bar.cc baz.c 2> %t.stderr
+// RUN: not grep Formatting %t.stderr
+
+// RUN: cd level2
+// RUN: touch foo.c foo.js
+// RUN: clang-format -verbose foo.c foo.js 2> %t.stderr
+// RUN: not grep Formatting %t.stderr
+
+// RUN: touch .clang-format-ignore
+// RUN: clang-format -verbose foo.c foo.js 2> %t.stderr
+// RUN: grep "Formatting \[1/2] foo.c" %t.stderr
+// RUN: grep "Formatting \[2/2] foo.js" %t.stderr
+
+// RUN: echo "*.js" > .clang-format-ignore
+// RUN: clang-format -verbose foo.c foo.js 2> %t.stderr
+// RUN: grep "Formatting \[1/2] foo.c" %t.stderr
+// RUN: not grep "Formatting \[2/2] foo.js" %t.stderr
+
+// RUN: cd ../../..
+// RUN: rm -rf %t.dir
diff --git a/clang/test/Modules/pr64755.cppm b/clang/test/Modules/pr64755.cppm
index 75ef84315461..2d656868eb60 100644
--- a/clang/test/Modules/pr64755.cppm
+++ b/clang/test/Modules/pr64755.cppm
@@ -7,6 +7,11 @@
// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fmodule-file=a0=%t/a0.pcm -verify -fsyntax-only
// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -verify -fsyntax-only
+// RUN: %clang_cc1 -std=c++20 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%t -fmodule-name=a0 -x objective-c++ -emit-module %t/module.modulemap -o %t/a0.pcm
+// RUN: %clang_cc1 -std=c++20 -x objective-c++ %t/use_obj.cpp -fmodule-file=%t/a0.pcm -verify -fsyntax-only
+// RUN: %clang_cc1 -std=c++20 -x objective-c++ %t/use_obj.cpp -fmodule-file=a0=%t/a0.pcm -verify -fsyntax-only
+// RUN: %clang_cc1 -std=c++20 -x objective-c++ %t/use_obj.cpp -fprebuilt-module-path=%t -verify -fsyntax-only
+
//--- module.modulemap
module a0 { header "a0.h" export * }
@@ -15,3 +20,7 @@ void a0() {}
//--- use.cpp
import a0; // expected-error {{import of module 'a0' imported non C++20 importable modules}}
+
+//--- use_obj.cpp
+// expected-no-diagnostics
+@import a0;
diff --git a/clang/test/OpenMP/atomic_compare_codegen.cpp b/clang/test/OpenMP/atomic_compare_codegen.cpp
index 43607cc65ad9..03e5081a5c1d 100644
--- a/clang/test/OpenMP/atomic_compare_codegen.cpp
+++ b/clang/test/OpenMP/atomic_compare_codegen.cpp
@@ -13806,6 +13806,64 @@ double dxevd() {
return dv;
}
+
+double fail_dxevd() {
+ double dx, dv, de, dd;
+
+#pragma omp atomic compare capture relaxed fail(relaxed)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture acquire fail(relaxed)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture release fail(relaxed)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture acq_rel fail(relaxed)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture seq_cst fail(relaxed)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture relaxed fail(acquire)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture acquire fail(acquire)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture release fail(acquire)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture acq_rel fail(acquire)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture seq_cst fail(acquire)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture relaxed fail(seq_cst)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture acquire fail(seq_cst)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture release fail(seq_cst)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture acq_rel fail(seq_cst)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare capture seq_cst fail(seq_cst)
+ {if(dx == de) { dx = dv; } else { dd = dx; }}
+
+#pragma omp atomic compare seq_cst fail(acquire)
+ dx = dx < de ? de : dx;
+
+#pragma omp atomic compare relaxed fail(seq_cst)
+ dx = dx > de ? de : dx;
+
+ return dx;
+}
+
#endif
// CHECK-LABEL: @foo(
// CHECK-NEXT: entry:
@@ -61966,3 +62024,89 @@ double dxevd() {
// SIMD-ONLY0-NEXT: [[TMP180:%.*]] = load double, ptr [[DV]], align 8
// SIMD-ONLY0-NEXT: ret double [[TMP180]]
//
+// CHECK-LABEL: {{.+}}fail_dxevd{{.+}}
+// CHECK-NEXT: entry:
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}}cmpxchg ptr {{.+}} monotonic monotonic{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} acquire monotonic{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} release monotonic{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} acq_rel monotonic{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}}cmpxchg ptr {{.+}} seq_cst monotonic{{.+}}
+// CHECK: {{.+}}__kmpc_flush{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} monotonic acquire{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} acquire acquire{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} release acquire{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} acq_rel acquire{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} seq_cst acquire{{.+}}
+// CHECK: {{.+}}__kmpc_flush{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} monotonic seq_cst{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} acquire seq_cst{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} release seq_cst{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} acq_rel seq_cst{{.+}}
+// CHECK: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK-NEXT: {{.+}} bitcast double{{.+}}
+// CHECK: {{.+}}cmpxchg ptr {{.+}} seq_cst seq_cst{{.+}}
+// CHECK: call void {{.+}}__kmpc_flush{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} atomicrmw fmax {{.+}} seq_cst{{.+}}
+// CHECK-NEXT: call void {{.+}}__kmpc_flush{{.+}}
+// CHECK-NEXT: {{.+}} load double,{{.+}}
+// CHECK-NEXT: {{.+}} atomicrmw fmin {{.+}} monotonic{{.+}}
+// CHECK: ret double {{.+}}
diff --git a/clang/test/PCH/pr76443.cpp b/clang/test/PCH/pr76443.cpp
new file mode 100644
index 000000000000..5b3e23de7da0
--- /dev/null
+++ b/clang/test/PCH/pr76443.cpp
@@ -0,0 +1,24 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+//
+// RUN: %clang_cc1 -std=c++17 -emit-pch %s -o %t/h.pcm
+
+//--- header.h
+template <int... Nx> int stringData(const char (&...x)[Nx]) {
+ return 0;
+}
+int qt_meta_stringdata_CLASSQStyleENDCLASS = stringData(
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "");
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index db89aa7b608a..b3da54162da0 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -600,6 +600,7 @@
// RUN: %clang -target aarch64-none-elf -march=armv9.1-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
// RUN: %clang -target aarch64-none-elf -march=armv9.2-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
// RUN: %clang -target aarch64-none-elf -march=armv9.3-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
+// RUN: %clang -target aarch64-none-elf -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
// RUN: %clang -target aarch64-none-elf -march=armv9.5-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s
// CHECK-V81-OR-LATER: __ARM_FEATURE_ATOMICS 1
// CHECK-V85-OR-LATER: __ARM_FEATURE_BTI 1
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index 1ae6faea7767..27c7b4a271fe 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -2515,7 +2515,7 @@
// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M32
// RUN: %clang -march=grandridge -m32 -E -dM %s -o - 2>&1 \
// RUN: --target=i386 \
-// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_GRR_M32
+// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32
// RUN: %clang -march=arrowlake -m32 -E -dM %s -o - 2>&1 \
// RUN: -target i386-unknown-linux \
// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M32
@@ -2572,7 +2572,6 @@
// CHECK_SRF_M32: #define __PRFCHW__ 1
// CHECK_SRF_M32: #define __PTWRITE__ 1
// CHECK_SRF_M32-NOT: #define __RAOINT__ 1
-// CHECK_GRR_M32: #define __RAOINT__ 1
// CHECK_SRF_M32: #define __RDPID__ 1
// CHECK_SRF_M32: #define __RDRND__ 1
// CHECK_SRF_M32: #define __RDSEED__ 1
@@ -2618,7 +2617,7 @@
// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M64
// RUN: %clang -march=grandridge -m64 -E -dM %s -o - 2>&1 \
// RUN: --target=i386 \
-// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64,CHECK_GRR_M64
+// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M64
// RUN: %clang -march=arrowlake -m64 -E -dM %s -o - 2>&1 \
// RUN: -target i386-unknown-linux \
// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SRF_M64
@@ -2675,7 +2674,6 @@
// CHECK_SRF_M64: #define __PRFCHW__ 1
// CHECK_SRF_M64: #define __PTWRITE__ 1
// CHECK_SRF_M64-NOT: #define __RAOINT__ 1
-// CHECK_GRR_M64: #define __RAOINT__ 1
// CHECK_SRF_M64: #define __RDPID__ 1
// CHECK_SRF_M64: #define __RDRND__ 1
// CHECK_SRF_M64: #define __RDSEED__ 1
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index 35208b2eae8f..02d8d34116f8 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -33,7 +33,6 @@
// CHECK-NOT: __riscv_xcvmac {{.*$}}
// CHECK-NOT: __riscv_xcvmem {{.*$}}
// CHECK-NOT: __riscv_xcvsimd {{.*$}}
-// CHECK-NOT: __riscv_xsfcie {{.*$}}
// CHECK-NOT: __riscv_xsfvcp {{.*$}}
// CHECK-NOT: __riscv_xsfvfnrclipxfqf {{.*$}}
// CHECK-NOT: __riscv_xsfvfwmaccqqq {{.*$}}
@@ -119,7 +118,10 @@
// CHECK-NOT: __riscv_zfa {{.*$}}
// CHECK-NOT: __riscv_zfbfmin {{.*$}}
// CHECK-NOT: __riscv_zicfilp {{.*$}}
+// CHECK-NOT: __riscv_zicfiss {{.*$}}
// CHECK-NOT: __riscv_zicond {{.*$}}
+// CHECK-NOT: __riscv_zimop {{.*$}}
+// CHECK-NOT: __riscv_zcmop {{.*$}}
// CHECK-NOT: __riscv_ztso {{.*$}}
// CHECK-NOT: __riscv_zvbb {{.*$}}
// CHECK-NOT: __riscv_zvbc {{.*$}}
@@ -314,14 +316,6 @@
// CHECK-XCVSIMD-EXT: __riscv_xcvsimd 1000000{{$}}
// RUN: %clang --target=riscv32-unknown-linux-gnu \
-// RUN: -march=rv32ixsfcie -x c -E -dM %s \
-// RUN: -o - | FileCheck --check-prefix=CHECK-XSFCIE-EXT %s
-// RUN: %clang --target=riscv64-unknown-linux-gnu \
-// RUN: -march=rv64ixsfcie -x c -E -dM %s \
-// RUN: -o - | FileCheck --check-prefix=CHECK-XSFCIE-EXT %s
-// CHECK-XSFCIE-EXT: __riscv_xsfcie 1000000{{$}}
-
-// RUN: %clang --target=riscv32-unknown-linux-gnu \
// RUN: -march=rv32ixsfvcp -x c -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVCP-EXT %s
// RUN: %clang --target=riscv64-unknown-linux-gnu \
@@ -1071,6 +1065,22 @@
// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s
// CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}}
+// RUN: %clang --target=riscv32 -menable-experimental-extensions \
+// RUN: -march=rv32i_zimop0p1 -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s
+// RUN: %clang --target=riscv64 -menable-experimental-extensions \
+// RUN: -march=rv64i_zimop0p1 -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s
+// CHECK-ZIMOP-EXT: __riscv_zimop 1000{{$}}
+
+// RUN: %clang --target=riscv32 -menable-experimental-extensions \
+// RUN: -march=rv32i_zcmop0p2 -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZCMOP-EXT %s
+// RUN: %clang --target=riscv64 -menable-experimental-extensions \
+// RUN: -march=rv64i_zcmop0p2 -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZCMOP-EXT %s
+// CHECK-ZCMOP-EXT: __riscv_zcmop 2000{{$}}
+
// RUN: %clang --target=riscv32-unknown-linux-gnu -menable-experimental-extensions \
// RUN: -march=rv32iztso0p1 -x c -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZTSO-EXT %s
@@ -1278,3 +1288,11 @@
// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -E -dM %s \
// RUN: -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
// CHECK-MISALIGNED-FAST: __riscv_misaligned_fast 1
+
+// RUN: %clang -target riscv32 -menable-experimental-extensions \
+// RUN: -march=rv32izicfiss0p4 -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s
+// RUN: %clang -target riscv64 -menable-experimental-extensions \
+// RUN: -march=rv64izicfiss0p4 -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s
+// CHECK-ZICFISS-EXT: __riscv_zicfiss 4000{{$}}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 6ab6dabd92b1..6a6370bf99b1 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -241,3 +241,112 @@ void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __
svbfmlslb_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
svbfmlslt_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}
+
+void test_multiply_add_sub_long(uint32_t base, svint8_t s8, svuint8_t u8,
+ svint16_t s16, svuint16_t u16, svint8x2_t s8x2,
+ svuint8x2_t u8x2, svint16x2_t s16x2, svuint16x2_t u16x2,
+ svint8x4_t s8x4, svuint8x4_t u8x4, svint16x4_t s16x4, svuint16x4_t u16x4) __arm_streaming __arm_shared_za {
+
+ svmla_lane_za32_s8_vg4x1(base, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za32_u8_vg4x1(base, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za64_s16_vg4x1(base, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmla_lane_za64_u16_vg4x1(base, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmla_lane_za32_s8_vg4x2(base, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za32_u8_vg4x2(base, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za64_s16_vg4x2(base, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmla_lane_za64_u16_vg4x2(base, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmla_lane_za32_s8_vg4x4(base, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za32_u8_vg4x4(base, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmla_lane_za64_s16_vg4x4(base, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmla_lane_za64_u16_vg4x4(base, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmls_lane_za32_s8_vg4x1(base, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za32_u8_vg4x1(base, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za64_s16_vg4x1(base, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmls_lane_za64_u16_vg4x1(base, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmls_lane_za32_s8_vg4x2(base, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za32_u8_vg4x2(base, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za64_s16_vg4x2(base, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmls_lane_za64_u16_vg4x2(base, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svmls_lane_za32_s8_vg4x4(base, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za32_u8_vg4x4(base, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svmls_lane_za64_s16_vg4x4(base, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ svmls_lane_za64_u16_vg4x4(base, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+
+ svsumla_lane_za32_s8_vg4x1(base, s8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svsumla_lane_za32_s8_vg4x2(base, s8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svsumla_lane_za32_s8_vg4x4(base, s8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+
+ svusmla_lane_za32_u8_vg4x1(base, u8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svusmla_lane_za32_u8_vg4x2(base, u8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+ svusmla_lane_za32_u8_vg4x4(base, u8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+}
+
+void test_vertical_dot_product(uint32_t base, svint16x2_t s16x2, svuint16x2_t u16x2,
+ svint8x4_t s8x4, svuint8x4_t u8x4,
+ svint16x4_t s16x4, svuint16x4_t u16x4,
+ svfloat16x2_t f16x2, svbfloat16x2_t bf16x2,
+ svint16_t s16, svuint16_t u16,
+ svint8_t s8, svuint8_t u8,
+ svfloat16_t f16, svbfloat16_t b16) __arm_streaming __arm_shared_za {
+ // Test lane indices.
+ svvdot_lane_za32_s16_vg1x2(base, s16x2, s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svvdot_lane_za32_u16_vg1x2(base, u16x2, u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svvdot_lane_za32_s8_vg1x4(base, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svvdot_lane_za32_u8_vg1x4(base, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svvdot_lane_za64_s16_vg1x4(base, s16x4, s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+ svvdot_lane_za64_u16_vg1x4(base, u16x4, u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+ svvdot_lane_za32_f16_vg1x2(base, f16x2, f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svvdot_lane_za32_bf16_vg1x2(base, bf16x2, b16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svsuvdot_lane_za32_s8_vg1x4(base, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svusvdot_lane_za32_u8_vg1x4(base, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+}
+
+void test_fdot_za32_bad_lane(uint32_t slice_base, svfloat16_t z_f16,
+ svfloat16x2_t z_f16x2, svfloat16x4_t z_f16x4,
+ svbfloat16_t z_bf16, svbfloat16x2_t z_bf16x2,
+ svbfloat16x4_t z_bf16x4) __arm_streaming __arm_shared_za {
+ // 16-bit float
+ svdot_lane_za32_f16_vg1x2(slice_base, z_f16x2, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_f16_vg1x4(slice_base, z_f16x4, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+
+ // 16-bit binary float
+ svdot_lane_za32_bf16_vg1x2(slice_base, z_bf16x2, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_bf16_vg1x4(slice_base, z_bf16x4, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+}
+
+void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
+ svuint16x2_t z_u16x2, svuint16x4_t z_u16x4,
+ svint16_t z_s16, svint16x2_t z_s16x2,
+ svint16x4_t z_s16x4, svuint8_t z_u8,
+ svuint8x2_t z_u8x2, svuint8x4_t z_u8x4,
+ svint8_t z_s8, svint8x2_t z_s8x2,
+ svint8x4_t z_s8x4) __arm_streaming __arm_shared_za {
+ // Multi, indexed (unsigned)
+ svdot_lane_za32_u16_vg1x2(slice_base, z_u16x2, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_u16_vg1x4(slice_base, z_u16x4, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_u8_vg1x2(slice_base, z_u8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_u8_vg1x4(slice_base, z_u8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za64_u16_vg1x2(slice_base, z_u16x2, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+ svdot_lane_za64_u16_vg1x4(slice_base, z_u16x4, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+
+ // Multi, indexed (signed)
+ svdot_lane_za32_s16_vg1x2(slice_base, z_s16x2, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_s16_vg1x4(slice_base, z_s16x4, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svdot_lane_za64_s16_vg1x2(slice_base, z_s16x2, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+ svdot_lane_za64_s16_vg1x4(slice_base, z_s16x4, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+
+ // Multi, indexed (unsigned by signed)
+ svusdot_lane_za32_u8_vg1x2(slice_base, z_u8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svusdot_lane_za32_u8_vg1x4(slice_base, z_u8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+
+  // Multi, indexed (signed by unsigned)
+ svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+ svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+}
diff --git a/clang/test/Sema/riscv-vector-zve32x-check.c b/clang/test/Sema/riscv-vector-zve32x-check.c
index a021de8bf31f..aff6e477378c 100644
--- a/clang/test/Sema/riscv-vector-zve32x-check.c
+++ b/clang/test/Sema/riscv-vector-zve32x-check.c
@@ -97,11 +97,3 @@ __rvv_bool32_t vbool32 () { /* expected-error {{RISC-V type '__rvv_bool32_t' req
return b32; /* expected-error {{RISC-V type '__rvv_bool32_t' requires the 'zve32x' extension}} */
}
-
-__rvv_bool64_t vbool64 () { /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve32x' extension}} */
- __rvv_bool64_t b64; /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve32x' extension}} */
-
- (void)b64; /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve32x' extension}} */
-
- return b64; /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve32x' extension}} */
-}
diff --git a/clang/test/Sema/riscv-vector-zve64x-check.c b/clang/test/Sema/riscv-vector-zve64x-check.c
index 5fb2ad483f63..7ef156832702 100644
--- a/clang/test/Sema/riscv-vector-zve64x-check.c
+++ b/clang/test/Sema/riscv-vector-zve64x-check.c
@@ -37,3 +37,11 @@ __rvv_int64m1_t foo64() { /* expected-error {{RISC-V type '__rvv_int64m1_t' requ
return i64m1; /* expected-error {{RISC-V type '__rvv_int64m1_t' requires the 'zve64x' extension}} */
}
+
+__rvv_bool64_t vbool64 () { /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve64x' extension}} */
+ __rvv_bool64_t b64; /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve64x' extension}} */
+
+ (void)b64; /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve64x' extension}} */
+
+ return b64; /* expected-error {{RISC-V type '__rvv_bool64_t' requires the 'zve64x' extension}} */
+}
diff --git a/clang/test/Sema/rvv-required-features.c b/clang/test/Sema/rvv-required-features.c
index 2714ef04b9bf..5846f338aa80 100644
--- a/clang/test/Sema/rvv-required-features.c
+++ b/clang/test/Sema/rvv-required-features.c
@@ -1,8 +1,6 @@
// REQUIRES: riscv-registered-target
// RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +xsfvcp \
-// RUN: -target-feature +xsfvqmaccdod -target-feature +xsfvqmaccqoq \
-// RUN: -target-feature +experimental-zvfbfmin -target-feature +xsfvfwmaccqqq \
-// RUN: -target-feature +xsfvfnrclipxfqf %s -fsyntax-only -verify
+// RUN: -target-feature +xsfvqmaccdod -target-feature +xsfvqmaccqoq %s -fsyntax-only -verify
// expected-no-diagnostics
@@ -25,18 +23,6 @@ void test_xsfvqmaccdod(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
__riscv_sf_vqmacc_2x8x2(vd, vs1, vs2, vl);
}
-void test_xsfvqmaccqoq(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) {
+void test_xsfvqmaccqoq(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) {
__riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl);
}
-
-void test_xsfvfwmaccqqq(vfloat32m1_t vd, vbfloat16m1_t vs1, vbfloat16mf2_t vs2, size_t vl) {
- __riscv_sf_vfwmacc_4x4x4(vd, vs1, vs2, vl);
-}
-
-void test_xsfvfnrclipxufqf(vfloat32m1_t vs1, float rs2, size_t vl) {
- __riscv_sf_vfnrclip_xu_f_qf(vs1, rs2, vl);
-}
-
-void test_xsfvfnrclipxfqf(vfloat32m1_t vs1, float rs2, size_t vl) {
- __riscv_sf_vfnrclip_x_f_qf(vs1, rs2, vl);
-}
diff --git a/clang/test/Sema/switch-default.c b/clang/test/Sema/switch-default.c
deleted file mode 100644
index 342a97ee68b1..000000000000
--- a/clang/test/Sema/switch-default.c
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -Wswitch-default %s
-
-int f1(int a) {
- switch (a) { // expected-warning {{'switch' missing 'default' label}}
- case 1: a++; break;
- case 2: a += 2; break;
- }
- return a;
-}
-
-int f2(int a) {
- switch (a) { // no-warning
- default:
- ;
- }
- return a;
-}
-
-// Warn even completely covered Enum cases(GCC compatibility).
-enum E { A, B };
-enum E check_enum(enum E e) {
- switch (e) { // expected-warning {{'switch' missing 'default' label}}
- case A: break;
- case B: break;
- }
- return e;
-}
-
diff --git a/clang/test/Sema/switch-default.cpp b/clang/test/Sema/switch-default.cpp
new file mode 100644
index 000000000000..32d03dae8827
--- /dev/null
+++ b/clang/test/Sema/switch-default.cpp
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wswitch-default %s
+
+int f1(int a) {
+ switch (a) { // expected-warning {{'switch' missing 'default' label}}
+ case 1: a++; break;
+ case 2: a += 2; break;
+ }
+ return a;
+}
+
+int f2(int a) {
+ switch (a) { // no-warning
+ default:
+ ;
+ }
+ return a;
+}
+
+// Warn even on completely covered enum cases (GCC compatibility).
+enum E { A, B };
+enum E check_enum(enum E e) {
+ switch (e) { // expected-warning {{'switch' missing 'default' label}}
+ case A: break;
+ case B: break;
+ }
+ return e;
+}
+
+template<typename Index>
+int t1(Index i)
+{
+ switch (i) { // expected-warning {{'switch' missing 'default' label}}
+ case 0: return 0;
+ case 1: return 1;
+ }
+ return 0;
+}
+
+template<typename Index>
+int t2(Index i)
+{
+ switch (i) { // no-warning
+ case 0: return 0;
+ case 1: return 1;
+ default: return 2;
+ }
+ return 0;
+}
+
+int main() {
+ return t1(1); // expected-note {{in instantiation of function template specialization 't1<int>' requested here}}
+}
+
diff --git a/clang/test/SemaCXX/GH63151.cpp b/clang/test/SemaCXX/GH63151.cpp
index 2c7533ed88f3..a4d0da0beee2 100644
--- a/clang/test/SemaCXX/GH63151.cpp
+++ b/clang/test/SemaCXX/GH63151.cpp
@@ -1,12 +1,12 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,narrowing %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-c++11-narrowing-const-reference -verify %s
struct A { A(const unsigned &x) {} };
void foo(int p) {
- A a { -1 }; // expected-error {{constant expression evaluates to -1 which cannot be narrowed to type 'unsigned int'}}
+ A a { -1 }; // narrowing-error {{constant expression evaluates to -1 which cannot be narrowed to type 'unsigned int'}}
A b { 0 };
- A c { p }; // expected-error {{non-constant-expression cannot be narrowed from type 'int' to 'unsigned int' in initializer list}}
- A d { 0.5 }; // expected-error {{type 'double' cannot be narrowed to 'unsigned int' in initializer list}}
+ A c { p }; // narrowing-error {{non-constant-expression cannot be narrowed from type 'int' to 'unsigned int' in initializer list}}
+ A d { 0.5 }; // narrowing-error {{type 'double' cannot be narrowed to 'unsigned int' in initializer list}}
// expected-warning@-1 {{implicit conversion from 'double' to 'unsigned int' changes value from 0.5 to 0}}
}
diff --git a/clang/test/SemaCXX/crash-GH76228.cpp b/clang/test/SemaCXX/crash-GH76228.cpp
new file mode 100644
index 000000000000..33a939582312
--- /dev/null
+++ b/clang/test/SemaCXX/crash-GH76228.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -std=c++20 -verify %s
+// Check that we don't crash on incomplete members and bases when handling parenthesized initialization.
+class incomplete; // expected-note@-0 3 {{forward declaration of 'incomplete'}}
+struct foo {
+ int a;
+ incomplete b;
+ // expected-error@-1 {{incomplete type}}
+};
+foo a1(0);
+
+struct one_int {
+ int a;
+};
+struct bar : one_int, incomplete {};
+// expected-error@-1 {{incomplete type}}
+bar a2(0);
+
+incomplete a3[3](1,2,3);
+// expected-error@-1 {{incomplete type}}
+
+struct qux : foo {
+};
+qux a4(0);
+
+struct fred {
+ foo a[3];
+};
+fred a5(0);
diff --git a/clang/test/SemaCXX/paren-list-agg-init.cpp b/clang/test/SemaCXX/paren-list-agg-init.cpp
index f60b20e0d465..c1964a5a9eb0 100644
--- a/clang/test/SemaCXX/paren-list-agg-init.cpp
+++ b/clang/test/SemaCXX/paren-list-agg-init.cpp
@@ -289,7 +289,7 @@ int test() {
// used to crash
S a(0, 1);
S b(0);
- S c(0, 0, 1); // beforecxx20-warning {{aggregate initialization of type 'S' from a parenthesized list of values is a C++20 extension}}
+ S c(0, 0, 1);
S d {0, 1};
S e {0};
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index d2e3d8d43aef..be34dbbe886a 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -12,6 +12,7 @@
///
//===----------------------------------------------------------------------===//
+#include "../../lib/Format/MatchFilePath.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/DiagnosticOptions.h"
#include "clang/Basic/FileManager.h"
@@ -570,6 +571,69 @@ static int dumpConfig(bool IsSTDIN) {
return 0;
}
+// Check whether `FilePath` is ignored according to the nearest
+// .clang-format-ignore file based on the rules below:
+// - A blank line is skipped.
+// - Leading and trailing spaces of a line are trimmed.
+// - A line starting with a hash (`#`) is a comment.
+// - A non-comment line is a single pattern.
+// - The slash (`/`) is used as the directory separator.
+// - A pattern is relative to the directory of the .clang-format-ignore file (or
+// the root directory if the pattern starts with a slash).
+// - A pattern is negated if it starts with a bang (`!`).
+static bool isIgnored(StringRef FilePath) {
+ using namespace llvm::sys::fs;
+ if (!is_regular_file(FilePath))
+ return false;
+
+ using namespace llvm::sys::path;
+ SmallString<128> Path, AbsPath{FilePath};
+
+ make_absolute(AbsPath);
+ remove_dots(AbsPath, /*remove_dot_dot=*/true);
+
+ StringRef IgnoreDir{AbsPath};
+ do {
+ IgnoreDir = parent_path(IgnoreDir);
+ if (IgnoreDir.empty())
+ return false;
+
+ Path = IgnoreDir;
+ append(Path, ".clang-format-ignore");
+ } while (!is_regular_file(Path));
+
+ std::ifstream IgnoreFile{Path.c_str()};
+ if (!IgnoreFile.good())
+ return false;
+
+ const auto Pathname = convert_to_slash(AbsPath);
+ for (std::string Line; std::getline(IgnoreFile, Line);) {
+ auto Pattern = StringRef(Line).trim();
+ if (Pattern.empty() || Pattern[0] == '#')
+ continue;
+
+ const bool IsNegated = Pattern[0] == '!';
+ if (IsNegated)
+ Pattern = Pattern.drop_front();
+
+ if (Pattern.empty())
+ continue;
+
+ Pattern = Pattern.ltrim();
+ if (Pattern[0] != '/') {
+ Path = convert_to_slash(IgnoreDir);
+ append(Path, Style::posix, Pattern);
+ remove_dots(Path, /*remove_dot_dot=*/true, Style::posix);
+ Pattern = Path.str();
+ }
+
+ if (clang::format::matchFilePath(Pattern, Pathname) == !IsNegated)
+ return true;
+ }
+
+ return false;
+}
+
int main(int argc, const char **argv) {
llvm::InitLLVM X(argc, argv);
@@ -618,11 +682,14 @@ int main(int argc, const char **argv) {
unsigned FileNo = 1;
bool Error = false;
for (const auto &FileName : FileNames) {
+ const bool IsSTDIN = FileName == "-";
+ if (!IsSTDIN && isIgnored(FileName))
+ continue;
if (Verbose) {
errs() << "Formatting [" << FileNo++ << "/" << FileNames.size() << "] "
<< FileName << "\n";
}
- Error |= clang::format::format(FileName, FileName == "-");
+ Error |= clang::format::format(FileName, IsSTDIN);
}
return Error ? 1 : 0;
}
diff --git a/clang/tools/libclang/CIndexer.cpp b/clang/tools/libclang/CIndexer.cpp
index 430147b2aa77..12d9d418dea5 100644
--- a/clang/tools/libclang/CIndexer.cpp
+++ b/clang/tools/libclang/CIndexer.cpp
@@ -14,10 +14,10 @@
#include "CXString.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/Version.h"
+#include "clang/Config/config.h"
#include "clang/Driver/Driver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Config/llvm-config.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/Path.h"
@@ -127,7 +127,7 @@ const std::string &CIndexer::getClangResourcesPath() {
getClangResourcesPathImplAIX(LibClangPath);
#else
bool PathFound = false;
-#if defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
+#if defined(CLANG_HAVE_DLFCN_H) && defined(CLANG_HAVE_DLADDR)
Dl_info info;
// This silly cast below avoids a C++ warning.
if (dladdr((void *)(uintptr_t)clang_createTranslationUnit, &info) != 0) {
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 4dd7510bf8dd..ed8ecb080e26 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -561,6 +561,18 @@ TEST_P(ImportExpr, ImportVAArgExpr) {
cStyleCastExpr(hasSourceExpression(vaArgExpr())))));
}
+const internal::VariadicDynCastAllOfMatcher<Stmt, BuiltinBitCastExpr>
+ builtinBitCastExpr;
+
+TEST_P(ImportExpr, ImportBuiltinBitCastExpr) {
+ MatchVerifier<Decl> Verifier;
+ testImport("void declToImport(int X) {"
+ " (void)__builtin_bit_cast(float, X); }",
+ Lang_CXX20, "", Lang_CXX20, Verifier,
+ functionDecl(hasDescendant(
+ cStyleCastExpr(hasSourceExpression(builtinBitCastExpr())))));
+}
+
TEST_P(ImportExpr, CXXTemporaryObjectExpr) {
MatchVerifier<Decl> Verifier;
testImport(
@@ -7250,6 +7262,26 @@ TEST_P(ImportAutoFunctions, ReturnWithAutoTemplateType) {
Lang_CXX14, /*FindLast=*/true);
}
+TEST_P(ImportAutoFunctions, ReturnWithSubstNonTypeTemplateParmExpr) {
+ const char *Code =
+ R"(
+ template<int>
+ struct array {};
+
+ template <int N>
+ auto foo() { return array<N>(); }
+
+ void bar() { foo<0>(); }
+ )";
+ Decl *FromTU = getTuDecl(Code, Lang_CXX17);
+
+ auto *FromBar = FirstDeclMatcher<FunctionDecl>().match(
+ FromTU, functionDecl(hasName("bar")));
+
+ auto *ToBar = Import(FromBar, Lang_CXX17);
+ EXPECT_TRUE(ToBar);
+}
+
struct ImportSourceLocations : ASTImporterOptionSpecificTestBase {};
TEST_P(ImportSourceLocations, PreserveFileIDTreeStructure) {
@@ -7425,67 +7457,46 @@ void ImportAttributes::checkImported<Decl>(const Decl *From, const Decl *To) {
ToAST->getASTContext().getTranslationUnitDecl());
}
-// FIXME: Use ImportAttributes for this test.
-TEST_P(ASTImporterOptionSpecificTestBase, ImportExprOfAlignmentAttr) {
- // Test if import of these packed and aligned attributes does not trigger an
- // error situation where source location from 'From' context is referenced in
- // 'To' context through evaluation of the alignof attribute.
- // This happens if the 'alignof(A)' expression is not imported correctly.
- Decl *FromTU = getTuDecl(
+TEST_P(ImportAttributes, ImportAligned) {
+ AlignedAttr *FromAttr, *ToAttr;
+ importAttr<RecordDecl>(
R"(
struct __attribute__((packed)) A { int __attribute__((aligned(8))) X; };
- struct alignas(alignof(A)) S {};
+ struct alignas(alignof(A)) test {};
)",
- Lang_CXX11, "input.cc");
- auto *FromD = FirstDeclMatcher<CXXRecordDecl>().match(
- FromTU, cxxRecordDecl(hasName("S"), unless(isImplicit())));
- ASSERT_TRUE(FromD);
-
- auto *ToD = Import(FromD, Lang_CXX11);
- ASSERT_TRUE(ToD);
-
- auto *FromAttr = FromD->getAttr<AlignedAttr>();
- auto *ToAttr = ToD->getAttr<AlignedAttr>();
- EXPECT_EQ(FromAttr->isInherited(), ToAttr->isInherited());
- EXPECT_EQ(FromAttr->isPackExpansion(), ToAttr->isPackExpansion());
- EXPECT_EQ(FromAttr->isImplicit(), ToAttr->isImplicit());
- EXPECT_EQ(FromAttr->getSyntax(), ToAttr->getSyntax());
- EXPECT_EQ(FromAttr->getSemanticSpelling(), ToAttr->getSemanticSpelling());
- EXPECT_TRUE(ToAttr->getAlignmentExpr());
+ FromAttr, ToAttr);
+ checkImported(FromAttr->getAlignmentExpr(), ToAttr->getAlignmentExpr());
auto *ToA = FirstDeclMatcher<CXXRecordDecl>().match(
- ToD->getTranslationUnitDecl(),
+ ToAST->getASTContext().getTranslationUnitDecl(),
cxxRecordDecl(hasName("A"), unless(isImplicit())));
// Ensure that 'struct A' was imported (through reference from attribute of
- // 'S').
+ // struct 'test').
EXPECT_TRUE(ToA);
}
-// FIXME: Use ImportAttributes for this test.
-TEST_P(ASTImporterOptionSpecificTestBase, ImportFormatAttr) {
- Decl *FromTU = getTuDecl(
+TEST_P(ImportAttributes, ImportAlignValue) {
+ AlignValueAttr *FromAttr, *ToAttr;
+ importAttr<VarDecl>(
R"(
- int foo(const char * fmt, ...)
+ void *test __attribute__((align_value(64)));
+ )",
+ FromAttr, ToAttr);
+ checkImported(FromAttr->getAlignment(), ToAttr->getAlignment());
+}
+
+TEST_P(ImportAttributes, ImportFormat) {
+ FormatAttr *FromAttr, *ToAttr;
+ importAttr<FunctionDecl>(
+ R"(
+ int test(const char * fmt, ...)
__attribute__ ((__format__ (__scanf__, 1, 2)));
)",
- Lang_CXX03, "input.cc");
- auto *FromD = FirstDeclMatcher<FunctionDecl>().match(
- FromTU, functionDecl(hasName("foo")));
- ASSERT_TRUE(FromD);
+ FromAttr, ToAttr);
- auto *ToD = Import(FromD, Lang_CXX03);
- ASSERT_TRUE(ToD);
- ToD->dump(); // Should not crash!
-
- auto *FromAttr = FromD->getAttr<FormatAttr>();
- auto *ToAttr = ToD->getAttr<FormatAttr>();
- EXPECT_EQ(FromAttr->isInherited(), ToAttr->isInherited());
- EXPECT_EQ(FromAttr->isPackExpansion(), ToAttr->isPackExpansion());
- EXPECT_EQ(FromAttr->isImplicit(), ToAttr->isImplicit());
- EXPECT_EQ(FromAttr->getSyntax(), ToAttr->getSyntax());
- EXPECT_EQ(FromAttr->getAttributeSpellingListIndex(),
- ToAttr->getAttributeSpellingListIndex());
EXPECT_EQ(FromAttr->getType()->getName(), ToAttr->getType()->getName());
+ EXPECT_EQ(FromAttr->getFirstArg(), ToAttr->getFirstArg());
+ EXPECT_EQ(FromAttr->getFormatIdx(), ToAttr->getFormatIdx());
}
TEST_P(ImportAttributes, ImportEnableIf) {
@@ -9284,6 +9295,53 @@ TEST_P(ASTImporterOptionSpecificTestBase,
// EXPECT_EQ(ToF1Imported->getPreviousDecl(), ToF1);
}
+TEST_P(ASTImporterOptionSpecificTestBase,
+ ImportTypeAliasTemplateAfterSimilarCalledTemplateTypeParm) {
+ const char *Code =
+ R"(
+ struct S;
+ template <typename>
+ using Callable = S;
+ template <typename Callable>
+ int bindingFunctionVTable;
+ )";
+ Decl *FromTU = getTuDecl(Code, Lang_CXX17);
+
+ auto *FromCallable = FirstDeclMatcher<TypeAliasTemplateDecl>().match(
+ FromTU, typeAliasTemplateDecl(hasName("Callable")));
+
+ auto *FromCallableParm = FirstDeclMatcher<TemplateTypeParmDecl>().match(
+ FromTU, templateTypeParmDecl(hasName("Callable")));
+
+ auto *ToFromCallableParm = Import(FromCallableParm, Lang_CXX17);
+ auto *ToCallable = Import(FromCallable, Lang_CXX17);
+ EXPECT_TRUE(ToFromCallableParm);
+ EXPECT_TRUE(ToCallable);
+}
+
+TEST_P(ASTImporterOptionSpecificTestBase, ImportConflictTypeAliasTemplate) {
+ const char *ToCode =
+ R"(
+ struct S;
+ template <typename, typename>
+ using Callable = S;
+ )";
+ const char *Code =
+ R"(
+ struct S;
+ template <typename>
+ using Callable = S;
+ )";
+ (void)getToTuDecl(ToCode, Lang_CXX17);
+ Decl *FromTU = getTuDecl(Code, Lang_CXX17);
+
+ auto *FromCallable = FirstDeclMatcher<TypeAliasTemplateDecl>().match(
+ FromTU, typeAliasTemplateDecl(hasName("Callable")));
+
+ auto *ImportedCallable = Import(FromCallable, Lang_CXX17);
+ EXPECT_FALSE(ImportedCallable);
+}
+
INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
DefaultTestValuesForRunOptions);
diff --git a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
index 84fe675c32c2..cd6a37d370e8 100644
--- a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
@@ -89,8 +89,6 @@ TEST(RecordOpsTest, CopyRecord) {
auto *S2Val = cast<RecordValue>(Env.getValue(S2));
EXPECT_NE(S1Val, S2Val);
- S1Val->setProperty("prop", Env.getBoolLiteralValue(true));
-
copyRecord(S1, S2, Env);
EXPECT_EQ(getFieldValue(&S1, *OuterIntDecl, Env),
@@ -104,8 +102,6 @@ TEST(RecordOpsTest, CopyRecord) {
S1Val = cast<RecordValue>(Env.getValue(S1));
S2Val = cast<RecordValue>(Env.getValue(S2));
EXPECT_NE(S1Val, S2Val);
-
- EXPECT_EQ(S2Val->getProperty("prop"), &Env.getBoolLiteralValue(true));
});
}
@@ -150,9 +146,6 @@ TEST(RecordOpsTest, RecordsEqual) {
Env.setValue(S1.getSyntheticField("synth_int"),
Env.create<IntegerValue>());
- cast<RecordValue>(Env.getValue(S1))
- ->setProperty("prop", Env.getBoolLiteralValue(true));
-
// Strategy: Create two equal records, then verify each of the various
// ways in which records can differ causes recordsEqual to return false.
// changes we can make to the record.
@@ -202,36 +195,6 @@ TEST(RecordOpsTest, RecordsEqual) {
EXPECT_FALSE(recordsEqual(S1, S2, Env));
copyRecord(S1, S2, Env);
EXPECT_TRUE(recordsEqual(S1, S2, Env));
-
- // S1 and S2 have the same property with different values.
- cast<RecordValue>(Env.getValue(S2))
- ->setProperty("prop", Env.getBoolLiteralValue(false));
- EXPECT_FALSE(recordsEqual(S1, S2, Env));
- copyRecord(S1, S2, Env);
- EXPECT_TRUE(recordsEqual(S1, S2, Env));
-
- // S1 has a property that S2 doesn't have.
- cast<RecordValue>(Env.getValue(S1))
- ->setProperty("other_prop", Env.getBoolLiteralValue(false));
- EXPECT_FALSE(recordsEqual(S1, S2, Env));
- // We modified S1 this time, so need to copy back the other way.
- copyRecord(S2, S1, Env);
- EXPECT_TRUE(recordsEqual(S1, S2, Env));
-
- // S2 has a property that S1 doesn't have.
- cast<RecordValue>(Env.getValue(S2))
- ->setProperty("other_prop", Env.getBoolLiteralValue(false));
- EXPECT_FALSE(recordsEqual(S1, S2, Env));
- copyRecord(S1, S2, Env);
- EXPECT_TRUE(recordsEqual(S1, S2, Env));
-
- // S1 and S2 have the same number of properties, but with different
- // names.
- cast<RecordValue>(Env.getValue(S1))
- ->setProperty("prop1", Env.getBoolLiteralValue(false));
- cast<RecordValue>(Env.getValue(S2))
- ->setProperty("prop2", Env.getBoolLiteralValue(false));
- EXPECT_FALSE(recordsEqual(S1, S2, Env));
});
}
diff --git a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp
index 362b0dea58d6..b5fc7bbc431e 100644
--- a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp
@@ -133,7 +133,7 @@ void transferBinary(const BinaryOperator *BO, const MatchFinder::MatchResult &M,
LatticeTransferState &State) {
auto &A = State.Env.arena();
const Formula *Comp;
- if (BoolValue *V = cast_or_null<BoolValue>(State.Env.getValue(*BO))) {
+ if (BoolValue *V = State.Env.get<BoolValue>(*BO)) {
Comp = &V->formula();
} else {
Comp = &A.makeAtomRef(A.makeAtom());
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index 4c3cb322eacf..8d481788af20 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -623,11 +623,11 @@ TEST_F(JoinFlowConditionsTest, JoinDistinctButProvablyEquivalentValues) {
});
}
-class OptionalIntAnalysis final
- : public DataflowAnalysis<OptionalIntAnalysis, NoopLattice> {
+class NullPointerAnalysis final
+ : public DataflowAnalysis<NullPointerAnalysis, NoopLattice> {
public:
- explicit OptionalIntAnalysis(ASTContext &Context)
- : DataflowAnalysis<OptionalIntAnalysis, NoopLattice>(Context) {}
+ explicit NullPointerAnalysis(ASTContext &Context)
+ : DataflowAnalysis<NullPointerAnalysis, NoopLattice>(Context) {}
static NoopLattice initialElement() { return {}; }
@@ -636,40 +636,37 @@ public:
if (!CS)
return;
const Stmt *S = CS->getStmt();
- auto OptionalIntRecordDecl = recordDecl(hasName("OptionalInt"));
- auto HasOptionalIntType = hasType(OptionalIntRecordDecl);
-
- SmallVector<BoundNodes, 1> Matches = match(
- stmt(anyOf(cxxConstructExpr(HasOptionalIntType).bind("construct"),
- cxxOperatorCallExpr(
- callee(cxxMethodDecl(ofClass(OptionalIntRecordDecl))))
- .bind("operator"))),
- *S, getASTContext());
- if (const auto *E = selectFirst<CXXConstructExpr>(
- "construct", Matches)) {
- cast<RecordValue>(Env.getValue(*E))
- ->setProperty("has_value", Env.getBoolLiteralValue(false));
- } else if (const auto *E =
- selectFirst<CXXOperatorCallExpr>("operator", Matches)) {
- assert(E->getNumArgs() > 0);
- auto *Object = E->getArg(0);
- assert(Object != nullptr);
-
- refreshRecordValue(*Object, Env)
- .setProperty("has_value", Env.getBoolLiteralValue(true));
+ const Expr *E = dyn_cast<Expr>(S);
+ if (!E)
+ return;
+
+ if (!E->getType()->isPointerType())
+ return;
+
+ // Make sure we have a `PointerValue` for `E`.
+ auto *PtrVal = cast_or_null<PointerValue>(Env.getValue(*E));
+ if (PtrVal == nullptr) {
+ PtrVal = cast<PointerValue>(Env.createValue(E->getType()));
+ Env.setValue(*E, *PtrVal);
}
+
+ if (auto *Cast = dyn_cast<ImplicitCastExpr>(E);
+ Cast && Cast->getCastKind() == CK_NullToPointer)
+ PtrVal->setProperty("is_null", Env.getBoolLiteralValue(true));
+ else if (auto *Op = dyn_cast<UnaryOperator>(E);
+ Op && Op->getOpcode() == UO_AddrOf)
+ PtrVal->setProperty("is_null", Env.getBoolLiteralValue(false));
}
ComparisonResult compare(QualType Type, const Value &Val1,
const Environment &Env1, const Value &Val2,
const Environment &Env2) override {
- // Nothing to say about a value that does not model an `OptionalInt`.
- if (!Type->isRecordType() ||
- Type->getAsCXXRecordDecl()->getQualifiedNameAsString() != "OptionalInt")
+ // Nothing to say about a value that is not a pointer.
+ if (!Type->isPointerType())
return ComparisonResult::Unknown;
- auto *Prop1 = Val1.getProperty("has_value");
- auto *Prop2 = Val2.getProperty("has_value");
+ auto *Prop1 = Val1.getProperty("is_null");
+ auto *Prop2 = Val2.getProperty("is_null");
assert(Prop1 != nullptr && Prop2 != nullptr);
return areEquivalentValues(*Prop1, *Prop2) ? ComparisonResult::Same
: ComparisonResult::Different;
@@ -678,23 +675,22 @@ public:
bool merge(QualType Type, const Value &Val1, const Environment &Env1,
const Value &Val2, const Environment &Env2, Value &MergedVal,
Environment &MergedEnv) override {
- // Nothing to say about a value that does not model an `OptionalInt`.
- if (!Type->isRecordType() ||
- Type->getAsCXXRecordDecl()->getQualifiedNameAsString() != "OptionalInt")
+ // Nothing to say about a value that is not a pointer.
+ if (!Type->isPointerType())
return false;
- auto *HasValue1 = cast_or_null<BoolValue>(Val1.getProperty("has_value"));
- if (HasValue1 == nullptr)
+ auto *IsNull1 = cast_or_null<BoolValue>(Val1.getProperty("is_null"));
+ if (IsNull1 == nullptr)
return false;
- auto *HasValue2 = cast_or_null<BoolValue>(Val2.getProperty("has_value"));
- if (HasValue2 == nullptr)
+ auto *IsNull2 = cast_or_null<BoolValue>(Val2.getProperty("is_null"));
+ if (IsNull2 == nullptr)
return false;
- if (HasValue1 == HasValue2)
- MergedVal.setProperty("has_value", *HasValue1);
+ if (IsNull1 == IsNull2)
+ MergedVal.setProperty("is_null", *IsNull1);
else
- MergedVal.setProperty("has_value", MergedEnv.makeTopBoolValue());
+ MergedVal.setProperty("is_null", MergedEnv.makeTopBoolValue());
return true;
}
};
@@ -703,23 +699,14 @@ class WideningTest : public Test {
protected:
template <typename Matcher>
void runDataflow(llvm::StringRef Code, Matcher Match) {
- tooling::FileContentMappings FilesContents;
- FilesContents.push_back(
- std::make_pair<std::string, std::string>("widening_test_defs.h", R"(
- struct OptionalInt {
- OptionalInt() = default;
- OptionalInt& operator=(int);
- };
- )"));
ASSERT_THAT_ERROR(
- checkDataflow<OptionalIntAnalysis>(
- AnalysisInputs<OptionalIntAnalysis>(
+ checkDataflow<NullPointerAnalysis>(
+ AnalysisInputs<NullPointerAnalysis>(
Code, ast_matchers::hasName("target"),
[](ASTContext &Context, Environment &Env) {
- return OptionalIntAnalysis(Context);
+ return NullPointerAnalysis(Context);
})
- .withASTBuildArgs({"-fsyntax-only", "-std=c++17"})
- .withASTBuildVirtualMappedFiles(std::move(FilesContents)),
+ .withASTBuildArgs({"-fsyntax-only", "-std=c++17"}),
/*VerifyResults=*/[&Match](const llvm::StringMap<
DataflowAnalysisState<NoopLattice>>
&Results,
@@ -731,13 +718,12 @@ protected:
TEST_F(WideningTest, JoinDistinctValuesWithDistinctProperties) {
std::string Code = R"(
- #include "widening_test_defs.h"
-
void target(bool Cond) {
- OptionalInt Foo;
+ int *Foo = nullptr;
+ int i = 0;
/*[[p1]]*/
if (Cond) {
- Foo = 1;
+ Foo = &i;
/*[[p2]]*/
}
(void)0;
@@ -760,27 +746,27 @@ TEST_F(WideningTest, JoinDistinctValuesWithDistinctProperties) {
return Env.getValue(*FooDecl);
};
- EXPECT_EQ(GetFooValue(Env1)->getProperty("has_value"),
- &Env1.getBoolLiteralValue(false));
- EXPECT_EQ(GetFooValue(Env2)->getProperty("has_value"),
- &Env2.getBoolLiteralValue(true));
+ EXPECT_EQ(GetFooValue(Env1)->getProperty("is_null"),
+ &Env1.getBoolLiteralValue(true));
+ EXPECT_EQ(GetFooValue(Env2)->getProperty("is_null"),
+ &Env2.getBoolLiteralValue(false));
EXPECT_TRUE(
- isa<TopBoolValue>(GetFooValue(Env3)->getProperty("has_value")));
+ isa<TopBoolValue>(GetFooValue(Env3)->getProperty("is_null")));
});
}
TEST_F(WideningTest, JoinDistinctValuesWithSameProperties) {
std::string Code = R"(
- #include "widening_test_defs.h"
-
void target(bool Cond) {
- OptionalInt Foo;
+ int *Foo = nullptr;
+ int i1 = 0;
+ int i2 = 0;
/*[[p1]]*/
if (Cond) {
- Foo = 1;
+ Foo = &i1;
/*[[p2]]*/
} else {
- Foo = 2;
+ Foo = &i2;
/*[[p3]]*/
}
(void)0;
@@ -805,14 +791,14 @@ TEST_F(WideningTest, JoinDistinctValuesWithSameProperties) {
return Env.getValue(*FooDecl);
};
- EXPECT_EQ(GetFooValue(Env1)->getProperty("has_value"),
- &Env1.getBoolLiteralValue(false));
- EXPECT_EQ(GetFooValue(Env2)->getProperty("has_value"),
- &Env2.getBoolLiteralValue(true));
- EXPECT_EQ(GetFooValue(Env3)->getProperty("has_value"),
- &Env3.getBoolLiteralValue(true));
- EXPECT_EQ(GetFooValue(Env4)->getProperty("has_value"),
- &Env4.getBoolLiteralValue(true));
+ EXPECT_EQ(GetFooValue(Env1)->getProperty("is_null"),
+ &Env1.getBoolLiteralValue(true));
+ EXPECT_EQ(GetFooValue(Env2)->getProperty("is_null"),
+ &Env2.getBoolLiteralValue(false));
+ EXPECT_EQ(GetFooValue(Env3)->getProperty("is_null"),
+ &Env3.getBoolLiteralValue(false));
+ EXPECT_EQ(GetFooValue(Env4)->getProperty("is_null"),
+ &Env4.getBoolLiteralValue(false));
});
}
@@ -849,13 +835,13 @@ TEST_F(WideningTest, DistinctPointersToTheSameLocationAreEquivalent) {
TEST_F(WideningTest, DistinctValuesWithSamePropertiesAreEquivalent) {
std::string Code = R"(
- #include "widening_test_defs.h"
-
void target(bool Cond) {
- OptionalInt Foo;
- Foo = 1;
+ int *Foo;
+ int i1 = 0;
+ int i2 = 0;
+ Foo = &i1;
while (Cond) {
- Foo = 2;
+ Foo = &i2;
}
(void)0;
/*[[p]]*/
@@ -872,8 +858,8 @@ TEST_F(WideningTest, DistinctValuesWithSamePropertiesAreEquivalent) {
ASSERT_THAT(FooDecl, NotNull());
const auto *FooVal = Env.getValue(*FooDecl);
- EXPECT_EQ(FooVal->getProperty("has_value"),
- &Env.getBoolLiteralValue(true));
+ EXPECT_EQ(FooVal->getProperty("is_null"),
+ &Env.getBoolLiteralValue(false));
});
}
diff --git a/clang/unittests/Format/CMakeLists.txt b/clang/unittests/Format/CMakeLists.txt
index 53136328928f..71f5886d946c 100644
--- a/clang/unittests/Format/CMakeLists.txt
+++ b/clang/unittests/Format/CMakeLists.txt
@@ -27,6 +27,7 @@ add_clang_unittest(FormatTests
IntegerLiteralSeparatorTest.cpp
MacroCallReconstructorTest.cpp
MacroExpanderTest.cpp
+ MatchFilePathTest.cpp
NamespaceEndCommentsFixerTest.cpp
ObjCPropertyAttributeOrderFixerTest.cpp
QualifierFixerTest.cpp
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 0e08723aa9e9..881993ede17c 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "FormatTestBase.h"
+#include "gmock/gmock.h"
#define DEBUG_TYPE "format-test"
@@ -8497,7 +8498,10 @@ TEST_F(FormatTest, BreaksFunctionDeclarationsWithTrailingTokens) {
" __attribute__((unused));");
Style = getGoogleStyle();
- Style.AttributeMacros.push_back("GUARDED_BY");
+ ASSERT_THAT(Style.AttributeMacros,
+ testing::AllOf(testing::Contains("GUARDED_BY"),
+ testing::Contains("ABSL_GUARDED_BY")));
+
verifyFormat(
"bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
" GUARDED_BY(aaaaaaaaaaaa);",
@@ -8514,6 +8518,23 @@ TEST_F(FormatTest, BreaksFunctionDeclarationsWithTrailingTokens) {
"bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa GUARDED_BY(aaaaaaaaaaaa) =\n"
" aaaaaaaaaaaaaaaaaaaaaaaaa;",
Style);
+
+ verifyFormat(
+ "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+ " ABSL_GUARDED_BY(aaaaaaaaaaaa);",
+ Style);
+ verifyFormat(
+ "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
+ " ABSL_GUARDED_BY(aaaaaaaaaaaa);",
+ Style);
+ verifyFormat(
+ "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ABSL_GUARDED_BY(aaaaaaaaaaaa) =\n"
+ " aaaaaaaa::aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;",
+ Style);
+ verifyFormat(
+ "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ABSL_GUARDED_BY(aaaaaaaaaaaa) =\n"
+ " aaaaaaaaaaaaaaaaaaaaaaaaa;",
+ Style);
}
TEST_F(FormatTest, FunctionAnnotations) {
@@ -10072,11 +10093,11 @@ TEST_F(FormatTest, ReturnTypeBreakingStyle) {
getGoogleStyleWithColumns(40));
verifyFormat("Tttttttttttttttttttttttt ppppppppppppppp\n"
" ABSL_GUARDED_BY(mutex1)\n"
- " ABSL_GUARDED_BY(mutex2);",
+ " ABSL_GUARDED_BY(mutex2);",
getGoogleStyleWithColumns(40));
verifyFormat("Tttttt f(int a, int b)\n"
" ABSL_GUARDED_BY(mutex1)\n"
- " ABSL_GUARDED_BY(mutex2);",
+ " ABSL_GUARDED_BY(mutex2);",
getGoogleStyleWithColumns(40));
// * typedefs
verifyGoogleFormat("typedef ATTR(X) char x;");
@@ -26274,6 +26295,8 @@ TEST_F(FormatTest, BreakAfterAttributes) {
constexpr StringRef Code("[[maybe_unused]] const int i;\n"
"[[foo([[]])]] [[maybe_unused]]\n"
"int j;\n"
+ "[[maybe_unused]]\n"
+ "foo<int> k;\n"
"[[nodiscard]] inline int f(int &i);\n"
"[[foo([[]])]] [[nodiscard]]\n"
"int g(int &i);\n"
@@ -26294,6 +26317,7 @@ TEST_F(FormatTest, BreakAfterAttributes) {
Style.BreakAfterAttributes = FormatStyle::ABS_Never;
verifyFormat("[[maybe_unused]] const int i;\n"
"[[foo([[]])]] [[maybe_unused]] int j;\n"
+ "[[maybe_unused]] foo<int> k;\n"
"[[nodiscard]] inline int f(int &i);\n"
"[[foo([[]])]] [[nodiscard]] int g(int &i);\n"
"[[nodiscard]] inline int f(int &i) {\n"
@@ -26311,6 +26335,8 @@ TEST_F(FormatTest, BreakAfterAttributes) {
"const int i;\n"
"[[foo([[]])]] [[maybe_unused]]\n"
"int j;\n"
+ "[[maybe_unused]]\n"
+ "foo<int> k;\n"
"[[nodiscard]]\n"
"inline int f(int &i);\n"
"[[foo([[]])]] [[nodiscard]]\n"
@@ -26458,6 +26484,19 @@ TEST_F(FormatTest, BreakAfterAttributes) {
"{\n"
"}",
CtorDtorCode, Style);
+
+ verifyFormat("struct Foo {\n"
+ " [[maybe_unused]]\n"
+ " void operator+();\n"
+ "};\n"
+ "[[nodiscard]]\n"
+ "Foo &operator-(Foo &);",
+ Style);
+
+ Style.ReferenceAlignment = FormatStyle::ReferenceAlignmentStyle::RAS_Left;
+ verifyFormat("[[nodiscard]]\n"
+ "Foo& operator-(Foo&);",
+ Style);
}
TEST_F(FormatTest, InsertNewlineAtEOF) {
diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp
index e185eceb3530..3aded8f3726d 100644
--- a/clang/unittests/Format/FormatTestJS.cpp
+++ b/clang/unittests/Format/FormatTestJS.cpp
@@ -2836,5 +2836,11 @@ TEST_F(FormatTestJS, AlignConsecutiveAssignmentsAndDeclarations) {
Style);
}
+TEST_F(FormatTestJS, DontBreakFieldsAsGoToLabels) {
+ verifyFormat("export type Params = Config&{\n"
+ " columns: Column[];\n"
+ "};");
+}
+
} // namespace format
} // end namespace clang
diff --git a/clang/unittests/Format/MatchFilePathTest.cpp b/clang/unittests/Format/MatchFilePathTest.cpp
new file mode 100644
index 000000000000..55723584ddc8
--- /dev/null
+++ b/clang/unittests/Format/MatchFilePathTest.cpp
@@ -0,0 +1,169 @@
+//===- unittest/Format/MatchFilePathTest.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../lib/Format/MatchFilePath.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace format {
+namespace {
+
+class MatchFilePathTest : public ::testing::Test {
+protected:
+ bool match(llvm::StringRef FilePath, llvm::StringRef Pattern) {
+ return matchFilePath(Pattern, FilePath);
+ }
+};
+
+// Most of the test cases below are from:
+// https://github.com/python/cpython/blob/main/Lib/test/test_fnmatch.py
+
+TEST_F(MatchFilePathTest, Wildcard) {
+ EXPECT_TRUE(match("abc", "?*?"));
+ EXPECT_TRUE(match("abc", "???*"));
+ EXPECT_TRUE(match("abc", "*???"));
+ EXPECT_TRUE(match("abc", "???"));
+ EXPECT_TRUE(match("abc", "*"));
+ EXPECT_TRUE(match("abc", "ab[cd]"));
+ EXPECT_TRUE(match("abc", "ab[!de]"));
+ EXPECT_FALSE(match("abc", "ab[de]"));
+ EXPECT_FALSE(match("a", "??"));
+ EXPECT_FALSE(match("a", "b"));
+}
+
+TEST_F(MatchFilePathTest, Backslash) {
+ EXPECT_TRUE(match("a?", R"(a\?)"));
+ EXPECT_FALSE(match("a\\", R"(a\)"));
+ EXPECT_TRUE(match("\\", R"([\])"));
+ EXPECT_TRUE(match("a", R"([!\])"));
+ EXPECT_FALSE(match("\\", R"([!\])"));
+}
+
+TEST_F(MatchFilePathTest, Newline) {
+ EXPECT_TRUE(match("foo\nbar", "foo*"));
+ EXPECT_TRUE(match("foo\nbar\n", "foo*"));
+ EXPECT_FALSE(match("\nfoo", "foo*"));
+ EXPECT_TRUE(match("\n", "*"));
+}
+
+TEST_F(MatchFilePathTest, Star) {
+ EXPECT_TRUE(match(std::string(50, 'a'), "*a*a*a*a*a*a*a*a*a*a"));
+ EXPECT_FALSE(match((std::string(50, 'a') + 'b'), "*a*a*a*a*a*a*a*a*a*a"));
+}
+
+TEST_F(MatchFilePathTest, CaseSensitive) {
+ EXPECT_TRUE(match("abc", "abc"));
+ EXPECT_FALSE(match("AbC", "abc"));
+ EXPECT_FALSE(match("abc", "AbC"));
+ EXPECT_TRUE(match("AbC", "AbC"));
+}
+
+TEST_F(MatchFilePathTest, PathSeparators) {
+ EXPECT_TRUE(match("usr/bin", "usr/bin"));
+ EXPECT_TRUE(match("usr\\bin", R"(usr\\bin)"));
+}
+
+TEST_F(MatchFilePathTest, NumericEscapeSequence) {
+ EXPECT_TRUE(match("test", "te*"));
+ EXPECT_TRUE(match("test\xff", "te*\xff"));
+ EXPECT_TRUE(match("foo\nbar", "foo*"));
+}
+
+TEST_F(MatchFilePathTest, ValidBrackets) {
+ EXPECT_TRUE(match("z", "[az]"));
+ EXPECT_FALSE(match("z", "[!az]"));
+ EXPECT_TRUE(match("a", "[aa]"));
+ EXPECT_TRUE(match("^", "[^az]"));
+ EXPECT_TRUE(match("[", "[[az]"));
+ EXPECT_FALSE(match("]", "[!]]"));
+}
+
+TEST_F(MatchFilePathTest, InvalidBrackets) {
+ EXPECT_TRUE(match("[", "["));
+ EXPECT_TRUE(match("[]", "[]"));
+ EXPECT_TRUE(match("[!", "[!"));
+ EXPECT_TRUE(match("[!]", "[!]"));
+}
+
+TEST_F(MatchFilePathTest, Range) {
+ EXPECT_TRUE(match("c", "[b-d]"));
+ EXPECT_FALSE(match("c", "[!b-d]"));
+ EXPECT_TRUE(match("y", "[b-dx-z]"));
+ EXPECT_FALSE(match("y", "[!b-dx-z]"));
+}
+
+TEST_F(MatchFilePathTest, Hyphen) {
+ EXPECT_FALSE(match("#", "[!-#]"));
+ EXPECT_FALSE(match("-", "[!--.]"));
+ EXPECT_TRUE(match("_", "[^-`]"));
+ EXPECT_TRUE(match("]", "[[-^]"));
+ EXPECT_TRUE(match("]", R"([\-^])"));
+ EXPECT_TRUE(match("-", "[b-]"));
+ EXPECT_FALSE(match("-", "[!b-]"));
+ EXPECT_TRUE(match("-", "[-b]"));
+ EXPECT_FALSE(match("-", "[!-b]"));
+ EXPECT_TRUE(match("-", "[-]"));
+ EXPECT_FALSE(match("-", "[!-]"));
+}
+
+TEST_F(MatchFilePathTest, UpperLELower) {
+ EXPECT_FALSE(match("c", "[d-b]"));
+ EXPECT_TRUE(match("c", "[!d-b]"));
+ EXPECT_TRUE(match("y", "[d-bx-z]"));
+ EXPECT_FALSE(match("y", "[!d-bx-z]"));
+ EXPECT_TRUE(match("_", "[d-b^-`]"));
+ EXPECT_TRUE(match("]", "[d-b[-^]"));
+ EXPECT_TRUE(match("b", "[b-b]"));
+}
+
+TEST_F(MatchFilePathTest, SlashAndBackslashInBrackets) {
+ EXPECT_FALSE(match("/", "[/]"));
+ EXPECT_TRUE(match("\\", R"([\])"));
+ EXPECT_TRUE(match("[/]", "[/]"));
+ EXPECT_TRUE(match("\\", R"([\t])"));
+ EXPECT_TRUE(match("t", R"([\t])"));
+ EXPECT_FALSE(match("\t", R"([\t])"));
+}
+
+TEST_F(MatchFilePathTest, SlashAndBackslashInRange) {
+ EXPECT_FALSE(match("a/b", "a[.-0]b"));
+ EXPECT_TRUE(match("a\\b", "a[Z-^]b"));
+ EXPECT_FALSE(match("a/b", "a[/-0]b"));
+ EXPECT_TRUE(match("a[/-0]b", "a[/-0]b"));
+ EXPECT_FALSE(match("a/b", "a[.-/]b"));
+ EXPECT_TRUE(match("a[.-/]b", "a[.-/]b"));
+ EXPECT_TRUE(match("a\\b", R"(a[\-^]b)"));
+ EXPECT_TRUE(match("a\\b", R"(a[Z-\]b)"));
+}
+
+TEST_F(MatchFilePathTest, Brackets) {
+ EXPECT_TRUE(match("[", "[[]"));
+ EXPECT_TRUE(match("&", "[a&&b]"));
+ EXPECT_TRUE(match("|", "[a||b]"));
+ EXPECT_TRUE(match("~", "[a~~b]"));
+ EXPECT_TRUE(match(",", "[a-z+--A-Z]"));
+ EXPECT_FALSE(match(".", "[a-z--/A-Z]"));
+}
+
+TEST_F(MatchFilePathTest, Path) {
+ EXPECT_TRUE(match(".clang-format", "*"));
+ EXPECT_TRUE(match(".git", "*git*"));
+ EXPECT_TRUE(match(".gitignore", "*git*"));
+ EXPECT_TRUE(match("foo/bar", "foo*/*bar"));
+ EXPECT_TRUE(match("foo/bar", "*/*"));
+ EXPECT_TRUE(match("foo/bar", R"(*foo*\/*bar*)"));
+ EXPECT_FALSE(match("foo/bar", "foo*"));
+ EXPECT_FALSE(match("foo/bar", "foo?bar"));
+ EXPECT_FALSE(match("foo/bar", "foo*bar"));
+ EXPECT_FALSE(match("foobar", "foo*/*"));
+ EXPECT_FALSE(match("foo\\", R"(foo*\)"));
+}
+
+} // namespace
+} // namespace format
+} // namespace clang
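
The tests in the new MatchFilePathTest.cpp above exercise matchFilePath(), clang-format's fnmatch-style pattern matcher. As a reading aid only, here is a minimal, self-contained C++ sketch of the '*' and '?' subset of that pattern language; it is not the clang-format implementation (it omits bracket expressions and backslash escaping, and unlike matchFilePath its '*' also crosses '/' separators).

// Illustrative glob matcher covering only '*' and '?'; names are made up and
// the semantics only approximate the simplest cases tested above.
#include <cassert>
#include <string>

static bool globMatch(const std::string &Pat, const std::string &Str,
                      size_t P = 0, size_t S = 0) {
  if (P == Pat.size())
    return S == Str.size();
  if (Pat[P] == '*') // '*' matches any (possibly empty) sequence here.
    return globMatch(Pat, Str, P + 1, S) ||
           (S < Str.size() && globMatch(Pat, Str, P, S + 1));
  if (S < Str.size() && (Pat[P] == '?' || Pat[P] == Str[S]))
    return globMatch(Pat, Str, P + 1, S + 1);
  return false;
}

int main() {
  assert(globMatch("?*?", "abc"));
  assert(globMatch("*a*a*", std::string(50, 'a')));
  assert(!globMatch("??", "a"));
  return 0;
}
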
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 8e6935319b2f..2cafc0438ffb 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1718,6 +1718,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsFunctionDeclarationNames) {
ASSERT_EQ(Tokens.size(), 14u) << Tokens;
EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
EXPECT_TOKEN(Tokens[4], tok::l_paren, TT_FunctionTypeLParen);
+
+ auto Style = getLLVMStyle();
+ Style.TypeNames.push_back("time_t");
+ Tokens = annotate("int iso_time(time_t);", Style);
+ ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+ EXPECT_TOKEN(Tokens[1], tok::identifier, TT_FunctionDeclarationName);
+ EXPECT_TOKEN(Tokens[3], tok::identifier, TT_TypeName);
}
TEST_F(TokenAnnotatorTest, UnderstandsCtorAndDtorDeclNames) {
diff --git a/clang/unittests/StaticAnalyzer/CallEventTest.cpp b/clang/unittests/StaticAnalyzer/CallEventTest.cpp
index 5be25d2ada67..adbfe02a284d 100644
--- a/clang/unittests/StaticAnalyzer/CallEventTest.cpp
+++ b/clang/unittests/StaticAnalyzer/CallEventTest.cpp
@@ -36,12 +36,7 @@ void reportBug(const CheckerBase *Checker, const CallEvent &Call,
}
class CXXDeallocatorChecker : public Checker<check::PreCall> {
- std::unique_ptr<BugType> BT_uninitField;
-
public:
- CXXDeallocatorChecker()
- : BT_uninitField(new BugType(this, "CXXDeallocator")) {}
-
void checkPreCall(const CallEvent &Call, CheckerContext &C) const {
const auto *DC = dyn_cast<CXXDeallocatorCall>(&Call);
if (!DC) {
diff --git a/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp b/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp
index 69e29b47f925..51aca42a26b0 100644
--- a/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp
+++ b/clang/unittests/StaticAnalyzer/NoStateChangeFuncVisitorTest.cpp
@@ -82,7 +82,7 @@ public:
template <class Visitor>
class StatefulChecker : public Checker<check::PreCall> {
- mutable std::unique_ptr<BugType> BT;
+ const BugType BT{this, "error()", categories::SecurityError};
public:
void checkPreCall(const CallEvent &Call, CheckerContext &C) const {
@@ -102,11 +102,8 @@ public:
const ExplodedNode *N = C.generateErrorNode();
if (!N)
return;
- if (!BT)
- BT.reset(new BugType(this->getCheckerName(), "error()",
- categories::SecurityError));
auto R =
- std::make_unique<PathSensitiveBugReport>(*BT, "error() called", N);
+ std::make_unique<PathSensitiveBugReport>(BT, "error() called", N);
R->template addVisitor<Visitor>();
C.emitReport(std::move(R));
}
diff --git a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp
index d5c7a5c9bb82..cd46efc8ad76 100644
--- a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp
+++ b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp
@@ -89,8 +89,7 @@ TEST(RegisterCustomCheckers, CheckLocationIncDec) {
class CheckerRegistrationOrderPrinter
: public Checker<check::PreStmt<DeclStmt>> {
- std::unique_ptr<BugType> BT =
- std::make_unique<BugType>(this, "Registration order");
+ const BugType BT{this, "Registration order"};
public:
void checkPreStmt(const DeclStmt *DS, CheckerContext &C) const {
@@ -104,7 +103,7 @@ public:
.printEnabledCheckerList(OS);
// Strip a newline off.
auto R =
- std::make_unique<PathSensitiveBugReport>(*BT, OS.str().drop_back(1), N);
+ std::make_unique<PathSensitiveBugReport>(BT, OS.str().drop_back(1), N);
C.emitReport(std::move(R));
}
};
@@ -125,8 +124,6 @@ void addCheckerRegistrationOrderPrinter(CheckerRegistry &Registry) {
#define UNITTEST_CHECKER(CHECKER_NAME, DIAG_MSG) \
class CHECKER_NAME : public Checker<check::PreStmt<DeclStmt>> { \
- std::unique_ptr<BugType> BT = std::make_unique<BugType>(this, DIAG_MSG); \
- \
public: \
void checkPreStmt(const DeclStmt *DS, CheckerContext &C) const {} \
}; \
diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp
index da2a885ce851..d570bcae8d86 100644
--- a/clang/utils/TableGen/RISCVVEmitter.cpp
+++ b/clang/utils/TableGen/RISCVVEmitter.cpp
@@ -151,7 +151,7 @@ static BasicType ParseBasicType(char c) {
case 'd':
return BasicType::Float64;
break;
- case 'b':
+ case 'y':
return BasicType::BFloat16;
break;
default:
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 311c6b09dc79..6c302da106a2 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1603,6 +1603,25 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
OS << "extern \"C\" {\n";
OS << "#endif\n\n";
+ OS << "void __arm_za_disable(void) __arm_streaming_compatible;\n\n";
+
+ OS << "__ai bool __arm_has_sme(void) __arm_streaming_compatible {\n";
+ OS << " uint64_t x0, x1;\n";
+ OS << " __builtin_arm_get_sme_state(&x0, &x1);\n";
+ OS << " return x0 & (1ULL << 63);\n";
+ OS << "}\n\n";
+
+ OS << "__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible "
+ "{\n";
+ OS << " uint64_t x0, x1;\n";
+ OS << " __builtin_arm_get_sme_state(&x0, &x1);\n";
+ OS << " return x0 & 1;\n";
+ OS << "}\n\n";
+
+ OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) "
+ "__arm_streaming_compatible __arm_shared_za "
+ "{ }\n\n";
+
createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME);
OS << "#ifdef __cplusplus\n";
diff --git a/compiler-rt/lib/asan/asan_linux.cpp b/compiler-rt/lib/asan/asan_linux.cpp
index e19b4479aaf3..37d3bad1b1ec 100644
--- a/compiler-rt/lib/asan/asan_linux.cpp
+++ b/compiler-rt/lib/asan/asan_linux.cpp
@@ -33,7 +33,6 @@
# include "asan_premap_shadow.h"
# include "asan_thread.h"
# include "sanitizer_common/sanitizer_flags.h"
-# include "sanitizer_common/sanitizer_freebsd.h"
# include "sanitizer_common/sanitizer_hash.h"
# include "sanitizer_common/sanitizer_libc.h"
# include "sanitizer_common/sanitizer_procmaps.h"
@@ -59,13 +58,6 @@ extern Elf_Dyn _DYNAMIC;
extern ElfW(Dyn) _DYNAMIC[];
# endif
-// x86-64 FreeBSD 9.2 and older define 'ucontext_t' incorrectly in
-// 32-bit mode.
-# if SANITIZER_FREEBSD && (SANITIZER_WORDSIZE == 32) && \
- __FreeBSD_version <= 902001 // v9.2
-# define ucontext_t xucontext_t
-# endif
-
typedef enum {
ASAN_RT_VERSION_UNDEFINED = 0,
ASAN_RT_VERSION_DYNAMIC,
@@ -148,6 +140,11 @@ static int FindFirstDSOCallback(struct dl_phdr_info *info, size_t size,
internal_strncmp(info->dlpi_name, "linux-", sizeof("linux-") - 1) == 0)
return 0;
# endif
+# if SANITIZER_FREEBSD
+ // Ignore vDSO.
+ if (internal_strcmp(info->dlpi_name, "[vdso]") == 0)
+ return 0;
+# endif
*name = info->dlpi_name;
return 1;
diff --git a/compiler-rt/lib/asan/asan_new_delete.cpp b/compiler-rt/lib/asan/asan_new_delete.cpp
index 17280129c758..b5b1ced8ac5e 100644
--- a/compiler-rt/lib/asan/asan_new_delete.cpp
+++ b/compiler-rt/lib/asan/asan_new_delete.cpp
@@ -48,15 +48,6 @@ COMMENT_EXPORT("??_V@YAXPAX@Z") // operator delete[]
using namespace __asan;
-// FreeBSD prior v9.2 have wrong definition of 'size_t'.
-// http://svnweb.freebsd.org/base?view=revision&revision=232261
-#if SANITIZER_FREEBSD && SANITIZER_WORDSIZE == 32
-#include <sys/param.h>
-#if __FreeBSD_version <= 902001 // v9.2
-#define size_t unsigned
-#endif // __FreeBSD_version
-#endif // SANITIZER_FREEBSD && SANITIZER_WORDSIZE == 32
-
// This code has issues on OSX.
// See https://github.com/google/sanitizers/issues/131.
diff --git a/compiler-rt/lib/asan/asan_posix.cpp b/compiler-rt/lib/asan/asan_posix.cpp
index a5b87b7fbf1b..76564538bd5d 100644
--- a/compiler-rt/lib/asan/asan_posix.cpp
+++ b/compiler-rt/lib/asan/asan_posix.cpp
@@ -146,33 +146,37 @@ void PlatformTSDDtor(void *tsd) {
# endif
AsanThread::TSDDtor(tsd);
}
-#endif
+# endif
+
+static void BeforeFork() {
+ if (CAN_SANITIZE_LEAKS) {
+ __lsan::LockGlobal();
+ }
+ // `_lsan` functions are defined regardless of `CAN_SANITIZE_LEAKS` and lock
+ // the stuff we need.
+ __lsan::LockThreads();
+ __lsan::LockAllocator();
+ StackDepotLockBeforeFork();
+}
+
+static void AfterFork(bool fork_child) {
+ StackDepotUnlockAfterFork(fork_child);
+ // `_lsan` functions are defined regardless of `CAN_SANITIZE_LEAKS` and
+ // unlock the stuff we need.
+ __lsan::UnlockAllocator();
+ __lsan::UnlockThreads();
+ if (CAN_SANITIZE_LEAKS) {
+ __lsan::UnlockGlobal();
+ }
+}
void InstallAtForkHandler() {
# if SANITIZER_SOLARIS || SANITIZER_NETBSD || SANITIZER_APPLE
return; // FIXME: Implement FutexWait.
# endif
- auto before = []() {
- if (CAN_SANITIZE_LEAKS) {
- __lsan::LockGlobal();
- }
- // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and lock the
- // stuff we need.
- __lsan::LockThreads();
- __lsan::LockAllocator();
- StackDepotLockAll();
- };
- auto after = []() {
- StackDepotUnlockAll();
- // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and unlock
- // the stuff we need.
- __lsan::UnlockAllocator();
- __lsan::UnlockThreads();
- if (CAN_SANITIZE_LEAKS) {
- __lsan::UnlockGlobal();
- }
- };
- pthread_atfork(before, after, after);
+ pthread_atfork(
+ &BeforeFork, []() { AfterFork(/* fork_child= */ false); },
+ []() { AfterFork(/* fork_child= */ true); });
}
void InstallAtExitCheckLeaks() {
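
The same refactor recurs in asan, hwasan, lsan, msan, and dfsan in this patch: the before/after lambdas passed to pthread_atfork become named BeforeFork()/AfterFork(bool fork_child) helpers, so the post-fork handler knows whether it runs in the child (which StackDepotUnlockAfterFork now needs). A standalone sketch of just the registration pattern, with placeholder bodies instead of the sanitizer locking shown above:

// Minimal pthread_atfork sketch; the lock/unlock bodies are placeholders.
#include <cstdio>
#include <pthread.h>
#include <sys/wait.h>
#include <unistd.h>

static void BeforeFork() { std::puts("prepare: lock shared state"); }

static void AfterFork(bool fork_child) {
  std::printf("after fork in %s: unlock shared state\n",
              fork_child ? "child" : "parent");
}

int main() {
  // Captureless lambdas convert to the plain function pointers atfork expects.
  pthread_atfork(
      &BeforeFork, []() { AfterFork(/*fork_child=*/false); },
      []() { AfterFork(/*fork_child=*/true); });
  pid_t pid = fork();
  if (pid == 0)
    _exit(0);
  waitpid(pid, nullptr, 0);
  return 0;
}
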
diff --git a/compiler-rt/lib/asan/asan_rtl_x86_64.S b/compiler-rt/lib/asan/asan_rtl_x86_64.S
index e44587ac4331..9c5289856d8a 100644
--- a/compiler-rt/lib/asan/asan_rtl_x86_64.S
+++ b/compiler-rt/lib/asan/asan_rtl_x86_64.S
@@ -89,7 +89,12 @@ ENDF
#define ASAN_MEMORY_ACCESS_CHECK_ADD(reg, op, s, c) \
mov %##reg,%r10 ;\
shr $0x3,%r10 ;\
+ .if ASAN_SHADOW_OFFSET_CONST < 0x80000000 ;\
##c $0x0,ASAN_SHADOW_OFFSET_CONST(%r10) ;\
+ .else ;\
+ movabsq $ASAN_SHADOW_OFFSET_CONST,%r11 ;\
+ ##c $0x0,(%r10,%r11) ;\
+ .endif ;\
jne FLABEL(reg, op, s, add) ;\
retq ;\
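
The .if/.else above exists because an x86-64 memory operand can only encode a 32-bit sign-extended displacement, so a shadow offset of 0x80000000 or more must first be materialized in a register (movabsq into %r11) and used as an index. Both paths compute the usual ASan shadow lookup; a small arithmetic-only sketch, with kShadowOffset as a placeholder rather than the real ASAN_SHADOW_OFFSET_CONST:

// Sketch of the shadow address computation the asm above performs:
// shift the address right by 3, then add the platform's shadow offset.
#include <cstdint>
#include <cstdio>

static constexpr uint64_t kShadowOffset = 0x7fff8000; // placeholder value

int main() {
  uint64_t Addr = 0x602000000010;                 // arbitrary example address
  uint64_t Shadow = (Addr >> 3) + kShadowOffset;  // shr $0x3, then add offset
  std::printf("shadow byte for %#llx lives at %#llx\n",
              (unsigned long long)Addr, (unsigned long long)Shadow);
  return 0;
}
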
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c
index d59dbfdad380..44e1cf49d1e9 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64.c
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c
@@ -34,6 +34,9 @@ _Bool __aarch64_have_lse_atomics
__attribute__((visibility("hidden"), nocommon)) = false;
#if defined(__FreeBSD__)
+// clang-format off: should not reorder sys/auxv.h alphabetically
+#include <sys/auxv.h>
+// clang-format on
#include "aarch64/hwcap.inc"
#include "aarch64/lse_atomics/freebsd.inc"
#elif defined(__Fuchsia__)
@@ -133,7 +136,6 @@ struct {
#include "aarch64/fmv/mrs.inc"
#include "aarch64/fmv/freebsd.inc"
#elif defined(__Fuchsia__)
-#include "aarch64/fmv/mrs.inc"
#include "aarch64/fmv/fuchsia.inc"
#elif defined(__ANDROID__)
#include "aarch64/fmv/mrs.inc"
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc
index 4dab6ff58b37..d8e0280f4041 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc
@@ -1,22 +1,51 @@
-void __init_cpu_features_resolver(unsigned long hwcap,
- const __ifunc_arg_t *arg) {
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+void __init_cpu_features_resolver() {
if (__aarch64_cpu_features.features)
return;
- __init_cpu_features_constructor(hwcap, arg);
-}
-
-void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) {
- // CPU features already initialized.
- if (__aarch64_cpu_features.features)
+ // This ensures the vDSO is a direct link-time dependency of anything that
+ // needs this initializer code.
+#pragma comment(lib, "zircon")
+ uint32_t features;
+ zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+ if (status != ZX_OK)
return;
- unsigned long hwcap = getauxval(AT_HWCAP);
- unsigned long hwcap2 = getauxval(AT_HWCAP2);
+#define setCPUFeature(cpu_feature) \
+ __aarch64_cpu_features.features |= 1ULL << cpu_feature
+
+ if (features & ZX_ARM64_FEATURE_ISA_FP)
+ setCPUFeature(FEAT_FP);
+ if (features & ZX_ARM64_FEATURE_ISA_ASIMD)
+ setCPUFeature(FEAT_SIMD);
+ if (features & ZX_ARM64_FEATURE_ISA_AES)
+ setCPUFeature(FEAT_AES);
+ if (features & ZX_ARM64_FEATURE_ISA_PMULL)
+ setCPUFeature(FEAT_PMULL);
+ if (features & ZX_ARM64_FEATURE_ISA_SHA1)
+ setCPUFeature(FEAT_SHA1);
+ if (features & ZX_ARM64_FEATURE_ISA_SHA256)
+ setCPUFeature(FEAT_SHA2);
+ if (features & ZX_ARM64_FEATURE_ISA_CRC32)
+ setCPUFeature(FEAT_CRC);
+ if (features & ZX_ARM64_FEATURE_ISA_RDM)
+ setCPUFeature(FEAT_RDM);
+ if (features & ZX_ARM64_FEATURE_ISA_SHA3)
+ setCPUFeature(FEAT_SHA3);
+ if (features & ZX_ARM64_FEATURE_ISA_SM4)
+ setCPUFeature(FEAT_SM4);
+ if (features & ZX_ARM64_FEATURE_ISA_DP)
+ setCPUFeature(FEAT_DOTPROD);
+ if (features & ZX_ARM64_FEATURE_ISA_FHM)
+ setCPUFeature(FEAT_FP16FML);
+ if (features & ZX_ARM64_FEATURE_ISA_SHA512)
+ setCPUFeature(FEAT_SHA3);
+ if (features & ZX_ARM64_FEATURE_ISA_I8MM)
+ setCPUFeature(FEAT_I8MM);
+ if (features & ZX_ARM64_FEATURE_ISA_SVE)
+ setCPUFeature(FEAT_SVE);
- __ifunc_arg_t arg;
- arg._size = sizeof(__ifunc_arg_t);
- arg._hwcap = hwcap;
- arg._hwcap2 = hwcap2;
- __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg);
+ setCPUFeature(FEAT_INIT);
}
diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c
index 72b0d55d65f0..9d9a5d3f1542 100644
--- a/compiler-rt/lib/builtins/cpu_model/x86.c
+++ b/compiler-rt/lib/builtins/cpu_model/x86.c
@@ -647,35 +647,59 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family,
case 23:
CPU = "znver1";
*Type = AMDFAM17H;
- if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) {
+ if ((Model >= 0x30 && Model <= 0x3f) || (Model == 0x47) ||
+ (Model >= 0x60 && Model <= 0x67) || (Model >= 0x68 && Model <= 0x6f) ||
+ (Model >= 0x70 && Model <= 0x7f) || (Model >= 0x84 && Model <= 0x87) ||
+ (Model >= 0x90 && Model <= 0x97) || (Model >= 0x98 && Model <= 0x9f) ||
+ (Model >= 0xa0 && Model <= 0xaf)) {
+ // Family 17h Models 30h-3Fh (Starship) Zen 2
+ // Family 17h Models 47h (Cardinal) Zen 2
+ // Family 17h Models 60h-67h (Renoir) Zen 2
+ // Family 17h Models 68h-6Fh (Lucienne) Zen 2
+ // Family 17h Models 70h-7Fh (Matisse) Zen 2
+ // Family 17h Models 84h-87h (ProjectX) Zen 2
+ // Family 17h Models 90h-97h (VanGogh) Zen 2
+ // Family 17h Models 98h-9Fh (Mero) Zen 2
+ // Family 17h Models A0h-AFh (Mendocino) Zen 2
CPU = "znver2";
*Subtype = AMDFAM17H_ZNVER2;
- break; // 30h-3fh, 71h: Zen2
+ break;
}
- if (Model <= 0x0f) {
+ if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x20 && Model <= 0x2f)) {
+ // Family 17h Models 10h-1Fh (Raven1) Zen
+ // Family 17h Models 10h-1Fh (Picasso) Zen+
+ // Family 17h Models 20h-2Fh (Raven2 x86) Zen
*Subtype = AMDFAM17H_ZNVER1;
- break; // 00h-0Fh: Zen1
+ break;
}
break;
case 25:
CPU = "znver3";
*Type = AMDFAM19H;
- if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x5f)) {
- // Family 19h Models 00h-0Fh - Zen3
- // Family 19h Models 20h-2Fh - Zen3
- // Family 19h Models 30h-3Fh - Zen3
- // Family 19h Models 40h-4Fh - Zen3+
- // Family 19h Models 50h-5Fh - Zen3+
+ if ((Model <= 0x0f) || (Model >= 0x20 && Model <= 0x2f) ||
+ (Model >= 0x30 && Model <= 0x3f) || (Model >= 0x40 && Model <= 0x4f) ||
+ (Model >= 0x50 && Model <= 0x5f)) {
+ // Family 19h Models 00h-0Fh (Genesis, Chagall) Zen 3
+ // Family 19h Models 20h-2Fh (Vermeer) Zen 3
+ // Family 19h Models 30h-3Fh (Badami) Zen 3
+ // Family 19h Models 40h-4Fh (Rembrandt) Zen 3+
+ // Family 19h Models 50h-5Fh (Cezanne) Zen 3
*Subtype = AMDFAM19H_ZNVER3;
break;
}
- if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x74) ||
- (Model >= 0x78 && Model <= 0x7b) || (Model >= 0xA0 && Model <= 0xAf)) {
+ if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x6f) ||
+ (Model >= 0x70 && Model <= 0x77) || (Model >= 0x78 && Model <= 0x7f) ||
+ (Model >= 0xa0 && Model <= 0xaf)) {
+ // Family 19h Models 10h-1Fh (Stones; Storm Peak) Zen 4
+ // Family 19h Models 60h-6Fh (Raphael) Zen 4
+ // Family 19h Models 70h-77h (Phoenix, Hawkpoint1) Zen 4
+ // Family 19h Models 78h-7Fh (Phoenix 2, Hawkpoint2) Zen 4
+ // Family 19h Models A0h-AFh (Stones-Dense) Zen 4
CPU = "znver4";
*Subtype = AMDFAM19H_ZNVER4;
break; // "znver4"
}
- break;
+ break; // family 19h
default:
break; // Unknown AMD CPU.
}
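
As a reading aid only, the Family 19h model ranges listed in the comments above reduce to the following small classifier. The ranges are copied from those comments; this is not the compiler-rt detection logic and makes no claim to be exhaustive.

// Mirrors the Family 19h (family 25) ranges commented above: Zen 3 vs. Zen 4.
#include <cstdio>

static const char *znverForFamily19h(unsigned Model) {
  if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x5f))
    return "znver3"; // Genesis/Chagall, Vermeer, Badami, Rembrandt, Cezanne
  if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x6f) ||
      (Model >= 0x70 && Model <= 0x7f) || (Model >= 0xa0 && Model <= 0xaf))
    return "znver4"; // Stones/Storm Peak, Raphael, Phoenix/Hawkpoint, Stones-Dense
  return "unknown";
}

int main() {
  std::printf("0x61 -> %s\n", znverForFamily19h(0x61)); // Raphael: znver4
  std::printf("0x21 -> %s\n", znverForFamily19h(0x21)); // Vermeer: znver3
  return 0;
}
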
diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h
index 43bbdd5f8736..af406e760497 100644
--- a/compiler-rt/lib/builtins/fp_lib.h
+++ b/compiler-rt/lib/builtins/fp_lib.h
@@ -26,18 +26,6 @@
#include <stdbool.h>
#include <stdint.h>
-// x86_64 FreeBSD prior v9.3 define fixed-width types incorrectly in
-// 32-bit mode.
-#if defined(__FreeBSD__) && defined(__i386__)
-#include <sys/param.h>
-#if __FreeBSD_version < 903000 // v9.3
-#define uint64_t unsigned long long
-#define int64_t long long
-#undef UINT64_C
-#define UINT64_C(c) (c##ULL)
-#endif
-#endif
-
#if defined SINGLE_PRECISION
typedef uint16_t half_rep_t;
diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h
index 18bf0a7f3bf9..7624c7280615 100644
--- a/compiler-rt/lib/builtins/int_types.h
+++ b/compiler-rt/lib/builtins/int_types.h
@@ -139,7 +139,6 @@ typedef union {
udwords u;
double f;
} double_bits;
-#endif
typedef struct {
#if _YUGA_LITTLE_ENDIAN
@@ -220,7 +219,6 @@ typedef union {
#define CRT_HAS_TF_MODE
#endif
-#if CRT_HAS_FLOATING_POINT
#if __STDC_VERSION__ >= 199901L
typedef float _Complex Fcomplex;
typedef double _Complex Dcomplex;
@@ -270,5 +268,5 @@ typedef struct {
#define COMPLEXTF_IMAGINARY(x) (x).imaginary
#endif
-#endif
+#endif // CRT_HAS_FLOATING_POINT
#endif // INT_TYPES_H
diff --git a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp
index 9ec598bf2ce9..f95194d19f03 100644
--- a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp
+++ b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp
@@ -19,4 +19,10 @@ static ChainedOriginDepot chainedOriginDepot;
ChainedOriginDepot* GetChainedOriginDepot() { return &chainedOriginDepot; }
+void ChainedOriginDepotLockBeforeFork() { chainedOriginDepot.LockBeforeFork(); }
+
+void ChainedOriginDepotUnlockAfterFork(bool fork_child) {
+ chainedOriginDepot.UnlockAfterFork(fork_child);
+}
+
} // namespace __dfsan
diff --git a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h
index d715ef707f41..83b9e29e1b71 100644
--- a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h
+++ b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h
@@ -21,6 +21,9 @@ namespace __dfsan {
ChainedOriginDepot* GetChainedOriginDepot();
+void ChainedOriginDepotLockBeforeFork();
+void ChainedOriginDepotUnlockAfterFork(bool fork_child);
+
} // namespace __dfsan
#endif // DFSAN_CHAINED_ORIGIN_DEPOT_H
diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp
index 38371d353368..05b48fd0525e 100644
--- a/compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -2893,13 +2893,13 @@ int __dfso___isoc99_sscanf(char *str, const char *format, dfsan_label str_label,
}
static void BeforeFork() {
- StackDepotLockAll();
- GetChainedOriginDepot()->LockAll();
+ StackDepotLockBeforeFork();
+ ChainedOriginDepotLockBeforeFork();
}
-static void AfterFork() {
- GetChainedOriginDepot()->UnlockAll();
- StackDepotUnlockAll();
+static void AfterFork(bool fork_child) {
+ ChainedOriginDepotUnlockAfterFork(fork_child);
+ StackDepotUnlockAfterFork(fork_child);
}
SANITIZER_INTERFACE_ATTRIBUTE
@@ -2913,7 +2913,7 @@ SANITIZER_INTERFACE_ATTRIBUTE
pid_t __dfso_fork(dfsan_label *ret_label, dfsan_origin *ret_origin) {
BeforeFork();
pid_t pid = __dfsw_fork(ret_label);
- AfterFork();
+ AfterFork(/* fork_child= */ pid == 0);
return pid;
}
diff --git a/compiler-rt/lib/hwasan/hwasan_linux.cpp b/compiler-rt/lib/hwasan/hwasan_linux.cpp
index 3271a955e7ed..e6aa60b324fa 100644
--- a/compiler-rt/lib/hwasan/hwasan_linux.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_linux.cpp
@@ -521,28 +521,32 @@ uptr TagMemoryAligned(uptr p, uptr size, tag_t tag) {
return AddTagToPointer(p, tag);
}
+static void BeforeFork() {
+ if (CAN_SANITIZE_LEAKS) {
+ __lsan::LockGlobal();
+ }
+ // `_lsan` functions are defined regardless of `CAN_SANITIZE_LEAKS` and lock
+ // the stuff we need.
+ __lsan::LockThreads();
+ __lsan::LockAllocator();
+ StackDepotLockBeforeFork();
+}
+
+static void AfterFork(bool fork_child) {
+ StackDepotUnlockAfterFork(fork_child);
+ // `_lsan` functions are defined regardless of `CAN_SANITIZE_LEAKS` and
+ // unlock the stuff we need.
+ __lsan::UnlockAllocator();
+ __lsan::UnlockThreads();
+ if (CAN_SANITIZE_LEAKS) {
+ __lsan::UnlockGlobal();
+ }
+}
+
void HwasanInstallAtForkHandler() {
- auto before = []() {
- if (CAN_SANITIZE_LEAKS) {
- __lsan::LockGlobal();
- }
- // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and lock the
- // stuff we need.
- __lsan::LockThreads();
- __lsan::LockAllocator();
- StackDepotLockAll();
- };
- auto after = []() {
- StackDepotUnlockAll();
- // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and unlock
- // the stuff we need.
- __lsan::UnlockAllocator();
- __lsan::UnlockThreads();
- if (CAN_SANITIZE_LEAKS) {
- __lsan::UnlockGlobal();
- }
- };
- pthread_atfork(before, after, after);
+ pthread_atfork(
+ &BeforeFork, []() { AfterFork(/* fork_child= */ false); },
+ []() { AfterFork(/* fork_child= */ true); });
}
void InstallAtExitCheckLeaks() {
diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp
index 5e8aa315801b..1a018a891b56 100644
--- a/compiler-rt/lib/hwasan/hwasan_report.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_report.cpp
@@ -205,6 +205,7 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa,
tag_t addr_tag, uptr untagged_addr) {
uptr frames = Min((uptr)flags()->stack_history_size, sa->size());
bool found_local = false;
+ InternalScopedString location;
for (uptr i = 0; i < frames; i++) {
const uptr *record_addr = &(*sa)[i];
uptr record = *record_addr;
@@ -220,24 +221,56 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa,
for (LocalInfo &local : frame.locals) {
if (!local.has_frame_offset || !local.has_size || !local.has_tag_offset)
continue;
+ if (!(local.name && internal_strlen(local.name)) &&
+ !(local.function_name && internal_strlen(local.function_name)) &&
+ !(local.decl_file && internal_strlen(local.decl_file)))
+ continue;
tag_t obj_tag = base_tag ^ local.tag_offset;
if (obj_tag != addr_tag)
continue;
- // Calculate the offset from the object address to the faulting
- // address. Because we only store bits 4-19 of FP (bits 0-3 are
- // guaranteed to be zero), the calculation is performed mod 2^20 and may
- // harmlessly underflow if the address mod 2^20 is below the object
- // address.
- uptr obj_offset =
- (untagged_addr - fp - local.frame_offset) & (kRecordFPModulus - 1);
- if (obj_offset >= local.size)
- continue;
+ // Guess top bits of local variable from the faulting address, because
+ // we only store bits 4-19 of FP (bits 0-3 are guaranteed to be zero).
+ uptr local_beg = (fp + local.frame_offset) |
+ (untagged_addr & ~(uptr(kRecordFPModulus) - 1));
+ uptr local_end = local_beg + local.size;
+
if (!found_local) {
- Printf("Potentially referenced stack objects:\n");
+ Printf("\nPotentially referenced stack objects:\n");
found_local = true;
}
- Printf(" %s in %s %s:%d\n", local.name, local.function_name,
- local.decl_file, local.decl_line);
+
+ uptr offset;
+ const char *whence;
+ const char *cause;
+ if (local_beg <= untagged_addr && untagged_addr < local_end) {
+ offset = untagged_addr - local_beg;
+ whence = "inside";
+ cause = "use-after-scope";
+ } else if (untagged_addr >= local_end) {
+ offset = untagged_addr - local_end;
+ whence = "after";
+ cause = "stack-buffer-overflow";
+ } else {
+ offset = local_beg - untagged_addr;
+ whence = "before";
+ cause = "stack-buffer-overflow";
+ }
+ Decorator d;
+ Printf("%s", d.Error());
+ Printf("Cause: %s\n", cause);
+ Printf("%s", d.Default());
+ Printf("%s", d.Location());
+ StackTracePrinter::GetOrInit()->RenderSourceLocation(
+ &location, local.decl_file, local.decl_line, /* column= */ 0,
+ common_flags()->symbolize_vs_style,
+ common_flags()->strip_path_prefix);
+ Printf(
+ "%p is located %zd bytes %s a %zd-byte local variable %s [%p,%p) "
+ "in %s %s\n",
+ untagged_addr, offset, whence, local_end - local_beg, local.name,
+ local_beg, local_end, local.function_name, location.data());
+ location.clear();
+ Printf("%s\n", d.Default());
}
frame.Clear();
}
@@ -363,7 +396,7 @@ static void PrintTagsAroundAddr(uptr addr, GetTag get_tag,
InternalScopedString s;
addr = MemToShadow(addr);
s.AppendF(
- "Memory tags around the buggy address (one tag corresponds to %zd "
+ "\nMemory tags around the buggy address (one tag corresponds to %zd "
"bytes):\n",
kShadowAlignment);
PrintTagInfoAroundAddr(addr, kShadowLines, s,
@@ -745,8 +778,6 @@ void BaseReport::PrintAddressDescription() const {
// Check stack first. If the address is on the stack of a live thread, we
// know it cannot be a heap / global overflow.
for (const auto &sa : allocations.stack) {
- // TODO(fmayer): figure out how to distinguish use-after-return and
- // stack-buffer-overflow.
Printf("%s", d.Error());
Printf("\nCause: stack tag-mismatch\n");
Printf("%s", d.Location());
@@ -803,8 +834,10 @@ void BaseReport::PrintAddressDescription() const {
}
// Print the remaining threads, as an extra information, 1 line per thread.
- if (flags()->print_live_threads_info)
+ if (flags()->print_live_threads_info) {
+ Printf("\n");
hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { t->Announce(); });
+ }
if (!num_descriptions_printed)
// We exhausted our possibilities. Bail out.
@@ -1020,7 +1053,7 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size,
// See the frame breakdown defined in __hwasan_tag_mismatch (from
// hwasan_tag_mismatch_{aarch64,riscv64}.S).
void ReportRegisters(const uptr *frame, uptr pc) {
- Printf("Registers where the failure occurred (pc %p):\n", pc);
+ Printf("\nRegisters where the failure occurred (pc %p):\n", pc);
// We explicitly print a single line (4 registers/line) each iteration to
// reduce the amount of logcat error messages printed. Each Printf() will
diff --git a/compiler-rt/lib/lsan/lsan_posix.cpp b/compiler-rt/lib/lsan/lsan_posix.cpp
index 4bfadf1ef809..422c29acca69 100644
--- a/compiler-rt/lib/lsan/lsan_posix.cpp
+++ b/compiler-rt/lib/lsan/lsan_posix.cpp
@@ -100,23 +100,27 @@ void InstallAtExitCheckLeaks() {
Atexit(DoLeakCheck);
}
+static void BeforeFork() {
+ LockGlobal();
+ LockThreads();
+ LockAllocator();
+ StackDepotLockBeforeFork();
+}
+
+static void AfterFork(bool fork_child) {
+ StackDepotUnlockAfterFork(fork_child);
+ UnlockAllocator();
+ UnlockThreads();
+ UnlockGlobal();
+}
+
void InstallAtForkHandler() {
# if SANITIZER_SOLARIS || SANITIZER_NETBSD || SANITIZER_APPLE
return; // FIXME: Implement FutexWait.
# endif
- auto before = []() {
- LockGlobal();
- LockThreads();
- LockAllocator();
- StackDepotLockAll();
- };
- auto after = []() {
- StackDepotUnlockAll();
- UnlockAllocator();
- UnlockThreads();
- UnlockGlobal();
- };
- pthread_atfork(before, after, after);
+ pthread_atfork(
+ &BeforeFork, []() { AfterFork(/* fork_child= */ false); },
+ []() { AfterFork(/* fork_child= */ true); });
}
} // namespace __lsan
diff --git a/compiler-rt/lib/memprof/memprof_linux.cpp b/compiler-rt/lib/memprof/memprof_linux.cpp
index fcd927023f5c..fcb6f662a82e 100644
--- a/compiler-rt/lib/memprof/memprof_linux.cpp
+++ b/compiler-rt/lib/memprof/memprof_linux.cpp
@@ -20,7 +20,6 @@
#include "memprof_internal.h"
#include "memprof_thread.h"
#include "sanitizer_common/sanitizer_flags.h"
-#include "sanitizer_common/sanitizer_freebsd.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_procmaps.h"
diff --git a/compiler-rt/lib/msan/msan_chained_origin_depot.cpp b/compiler-rt/lib/msan/msan_chained_origin_depot.cpp
index 49b14131a89b..b98b0e6b14b5 100644
--- a/compiler-rt/lib/msan/msan_chained_origin_depot.cpp
+++ b/compiler-rt/lib/msan/msan_chained_origin_depot.cpp
@@ -31,12 +31,10 @@ u32 ChainedOriginDepotGet(u32 id, u32 *other) {
return chainedOriginDepot.Get(id, other);
}
-void ChainedOriginDepotLockAll() {
- chainedOriginDepot.LockAll();
-}
+void ChainedOriginDepotBeforeFork() { chainedOriginDepot.LockBeforeFork(); }
-void ChainedOriginDepotUnlockAll() {
- chainedOriginDepot.UnlockAll();
+void ChainedOriginDepotAfterFork(bool fork_child) {
+ chainedOriginDepot.UnlockAfterFork(fork_child);
}
} // namespace __msan
diff --git a/compiler-rt/lib/msan/msan_chained_origin_depot.h b/compiler-rt/lib/msan/msan_chained_origin_depot.h
index ea51c77a905b..7518745dc852 100644
--- a/compiler-rt/lib/msan/msan_chained_origin_depot.h
+++ b/compiler-rt/lib/msan/msan_chained_origin_depot.h
@@ -30,8 +30,8 @@ bool ChainedOriginDepotPut(u32 here_id, u32 prev_id, u32 *new_id);
// Retrieves the stored StackDepot ID for the given origin ID.
u32 ChainedOriginDepotGet(u32 id, u32 *other);
-void ChainedOriginDepotLockAll();
-void ChainedOriginDepotUnlockAll();
+void ChainedOriginDepotBeforeFork();
+void ChainedOriginDepotAfterFork(bool fork_child);
} // namespace __msan
diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp
index 04af6f4b27ac..c7ecb7cad566 100644
--- a/compiler-rt/lib/msan/msan_linux.cpp
+++ b/compiler-rt/lib/msan/msan_linux.cpp
@@ -256,22 +256,26 @@ void MsanTSDDtor(void *tsd) {
atomic_signal_fence(memory_order_seq_cst);
MsanThread::TSDDtor(tsd);
}
-#endif
+# endif
+
+static void BeforeFork() {
+ // Usually we lock ThreadRegistry, but msan does not have one.
+ LockAllocator();
+ StackDepotLockBeforeFork();
+ ChainedOriginDepotBeforeFork();
+}
+
+static void AfterFork(bool fork_child) {
+ ChainedOriginDepotAfterFork(fork_child);
+ StackDepotUnlockAfterFork(fork_child);
+ UnlockAllocator();
+ // Usually we unlock ThreadRegistry, but msan does not have one.
+}
void InstallAtForkHandler() {
- auto before = []() {
- // Usually we lock ThreadRegistry, but msan does not have one.
- LockAllocator();
- StackDepotLockAll();
- ChainedOriginDepotLockAll();
- };
- auto after = []() {
- ChainedOriginDepotUnlockAll();
- StackDepotUnlockAll();
- UnlockAllocator();
- // Usually we unlock ThreadRegistry, but msan does not have one.
- };
- pthread_atfork(before, after, after);
+ pthread_atfork(
+ &BeforeFork, []() { AfterFork(/* fork_child= */ false); },
+ []() { AfterFork(/* fork_child= */ true); });
}
} // namespace __msan
diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp
index 9c8ff9d9b8f9..41b99fabe84f 100644
--- a/compiler-rt/lib/msan/tests/msan_test.cpp
+++ b/compiler-rt/lib/msan/tests/msan_test.cpp
@@ -3241,7 +3241,7 @@ TEST(MemorySanitizer, dlopenFailed) {
#endif // MSAN_TEST_DISABLE_DLOPEN
-#if !defined(__FreeBSD__) && !defined(__NetBSD__)
+#if !defined(__NetBSD__)
TEST(MemorySanitizer, sched_getaffinity) {
cpu_set_t mask;
if (sched_getaffinity(getpid(), sizeof(mask), &mask) == 0)
diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
index fb7584c298a1..f762524c333a 100644
--- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt
+++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt
@@ -151,7 +151,6 @@ set(SANITIZER_IMPL_HEADERS
sanitizer_flags.h
sanitizer_flags.inc
sanitizer_flat_map.h
- sanitizer_freebsd.h
sanitizer_fuchsia.h
sanitizer_getauxval.h
sanitizer_hash.h
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp
index e0e2bd01069f..df2b2eb23df2 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp
@@ -139,9 +139,11 @@ u32 ChainedOriginDepot::Get(u32 id, u32 *other) {
return desc.here_id;
}
-void ChainedOriginDepot::LockAll() { depot.LockAll(); }
+void ChainedOriginDepot::LockBeforeFork() { depot.LockBeforeFork(); }
-void ChainedOriginDepot::UnlockAll() { depot.UnlockAll(); }
+void ChainedOriginDepot::UnlockAfterFork(bool fork_child) {
+ depot.UnlockAfterFork(fork_child);
+}
void ChainedOriginDepot::TestOnlyUnmap() { depot.TestOnlyUnmap(); }
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h
index f9f192b68571..f3da28129e6b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h
@@ -32,8 +32,8 @@ class ChainedOriginDepot {
// Retrieves the stored StackDepot ID for the given origin ID.
u32 Get(u32 id, u32 *other);
- void LockAll();
- void UnlockAll();
+ void LockBeforeFork();
+ void UnlockAfterFork(bool fork_child);
void TestOnlyUnmap();
private:
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index ba4670751697..77fa1b4965a7 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -6785,7 +6785,7 @@ INTERCEPTOR(SSIZE_T, sendto, int fd, void *buf, SIZE_T len, int flags,
#endif
#if SANITIZER_INTERCEPT_EVENTFD_READ_WRITE
-INTERCEPTOR(int, eventfd_read, int fd, u64 *value) {
+INTERCEPTOR(int, eventfd_read, int fd, __sanitizer_eventfd_t *value) {
void *ctx;
COMMON_INTERCEPTOR_ENTER(ctx, eventfd_read, fd, value);
COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
@@ -6796,7 +6796,7 @@ INTERCEPTOR(int, eventfd_read, int fd, u64 *value) {
}
return res;
}
-INTERCEPTOR(int, eventfd_write, int fd, u64 value) {
+INTERCEPTOR(int, eventfd_write, int fd, __sanitizer_eventfd_t value) {
void *ctx;
COMMON_INTERCEPTOR_ENTER(ctx, eventfd_write, fd, value);
if (fd >= 0) {
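
The interceptors above now take __sanitizer_eventfd_t (typedef'd as unsigned long long in the platform limits headers later in this patch) instead of u64, so FreeBSD's eventfd_t is covered alongside Linux. For reference, a usage sketch of the libc calls being intercepted, assuming a Linux or FreeBSD host that provides <sys/eventfd.h>:

// eventfd_read/eventfd_write usage; eventfd_t is a 64-bit counter type.
#include <cstdio>
#include <sys/eventfd.h>
#include <unistd.h>

int main() {
  int fd = eventfd(0, 0); // counter starts at 0
  if (fd < 0)
    return 1;
  eventfd_write(fd, 3);   // add 3 to the counter
  eventfd_t value = 0;
  eventfd_read(fd, &value); // reads 3 and resets the counter
  std::printf("read %llu\n", (unsigned long long)value);
  close(fd);
  return 0;
}
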
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h b/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h
index 8bb8304910c7..d246781fe1df 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h
@@ -109,6 +109,10 @@ class TwoLevelMap {
return *AddressSpaceView::LoadWritable(&map2[idx % kSize2]);
}
+ void Lock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { mu_.Lock(); }
+
+ void Unlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { mu_.Unlock(); }
+
private:
constexpr uptr MmapSize() const {
return RoundUpTo(kSize2 * sizeof(T), GetPageSizeCached());
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_freebsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_freebsd.h
deleted file mode 100644
index 82b227eab6da..000000000000
--- a/compiler-rt/lib/sanitizer_common/sanitizer_freebsd.h
+++ /dev/null
@@ -1,137 +0,0 @@
-//===-- sanitizer_freebsd.h -------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of Sanitizer runtime. It contains FreeBSD-specific
-// definitions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SANITIZER_FREEBSD_H
-#define SANITIZER_FREEBSD_H
-
-#include "sanitizer_internal_defs.h"
-
-// x86-64 FreeBSD 9.2 and older define 'ucontext_t' incorrectly in
-// 32-bit mode.
-#if SANITIZER_FREEBSD && (SANITIZER_WORDSIZE == 32)
-#include <osreldate.h>
-#if __FreeBSD_version <= 902001 // v9.2
-#include <link.h>
-#include <sys/param.h>
-#include <ucontext.h>
-
-namespace __sanitizer {
-
-typedef unsigned long long __xuint64_t;
-
-typedef __int32_t __xregister_t;
-
-typedef struct __xmcontext {
- __xregister_t mc_onstack;
- __xregister_t mc_gs;
- __xregister_t mc_fs;
- __xregister_t mc_es;
- __xregister_t mc_ds;
- __xregister_t mc_edi;
- __xregister_t mc_esi;
- __xregister_t mc_ebp;
- __xregister_t mc_isp;
- __xregister_t mc_ebx;
- __xregister_t mc_edx;
- __xregister_t mc_ecx;
- __xregister_t mc_eax;
- __xregister_t mc_trapno;
- __xregister_t mc_err;
- __xregister_t mc_eip;
- __xregister_t mc_cs;
- __xregister_t mc_eflags;
- __xregister_t mc_esp;
- __xregister_t mc_ss;
-
- int mc_len;
- int mc_fpformat;
- int mc_ownedfp;
- __xregister_t mc_flags;
-
- int mc_fpstate[128] __aligned(16);
- __xregister_t mc_fsbase;
- __xregister_t mc_gsbase;
- __xregister_t mc_xfpustate;
- __xregister_t mc_xfpustate_len;
-
- int mc_spare2[4];
-} xmcontext_t;
-
-typedef struct __xucontext {
- sigset_t uc_sigmask;
- xmcontext_t uc_mcontext;
-
- struct __ucontext *uc_link;
- stack_t uc_stack;
- int uc_flags;
- int __spare__[4];
-} xucontext_t;
-
-struct xkinfo_vmentry {
- int kve_structsize;
- int kve_type;
- __xuint64_t kve_start;
- __xuint64_t kve_end;
- __xuint64_t kve_offset;
- __xuint64_t kve_vn_fileid;
- __uint32_t kve_vn_fsid;
- int kve_flags;
- int kve_resident;
- int kve_private_resident;
- int kve_protection;
- int kve_ref_count;
- int kve_shadow_count;
- int kve_vn_type;
- __xuint64_t kve_vn_size;
- __uint32_t kve_vn_rdev;
- __uint16_t kve_vn_mode;
- __uint16_t kve_status;
- int _kve_ispare[12];
- char kve_path[PATH_MAX];
-};
-
-typedef struct {
- __uint32_t p_type;
- __uint32_t p_offset;
- __uint32_t p_vaddr;
- __uint32_t p_paddr;
- __uint32_t p_filesz;
- __uint32_t p_memsz;
- __uint32_t p_flags;
- __uint32_t p_align;
-} XElf32_Phdr;
-
-struct xdl_phdr_info {
- Elf_Addr dlpi_addr;
- const char *dlpi_name;
- const XElf32_Phdr *dlpi_phdr;
- Elf_Half dlpi_phnum;
- unsigned long long int dlpi_adds;
- unsigned long long int dlpi_subs;
- size_t dlpi_tls_modid;
- void *dlpi_tls_data;
-};
-
-typedef int (*__xdl_iterate_hdr_callback)(struct xdl_phdr_info *, size_t,
- void *);
-typedef int xdl_iterate_phdr_t(__xdl_iterate_hdr_callback, void *);
-
-#define xdl_iterate_phdr(callback, param) \
- (((xdl_iterate_phdr_t *)dl_iterate_phdr)((callback), (param)))
-
-} // namespace __sanitizer
-
-#endif // __FreeBSD_version <= 902001
-#endif // SANITIZER_FREEBSD && (SANITIZER_WORDSIZE == 32)
-
-#endif // SANITIZER_FREEBSD_H
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 841d7c096292..5d2dd3a7a658 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -58,7 +58,6 @@
# include <sched.h>
# include <signal.h>
# include <sys/mman.h>
-# include <sys/param.h>
# if !SANITIZER_SOLARIS
# include <sys/ptrace.h>
# endif
@@ -136,9 +135,7 @@ const int FUTEX_WAKE_PRIVATE = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
# define SANITIZER_LINUX_USES_64BIT_SYSCALLS 0
# endif
-// Note : FreeBSD had implemented both
-// Linux apis, available from
-// future 12.x version most likely
+// Note: FreeBSD implements both the Linux and OpenBSD APIs.
# if SANITIZER_LINUX && defined(__NR_getrandom)
# if !defined(GRND_NONBLOCK)
# define GRND_NONBLOCK 1
@@ -148,10 +145,8 @@ const int FUTEX_WAKE_PRIVATE = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
# define SANITIZER_USE_GETRANDOM 0
# endif // SANITIZER_LINUX && defined(__NR_getrandom)
-# if SANITIZER_FREEBSD && __FreeBSD_version >= 1200000
+# if SANITIZER_FREEBSD
# define SANITIZER_USE_GETENTROPY 1
-# else
-# define SANITIZER_USE_GETENTROPY 0
# endif
namespace __sanitizer {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
index 8e942b69e6a7..cccbb4d256df 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -21,7 +21,6 @@
# include "sanitizer_common.h"
# include "sanitizer_file.h"
# include "sanitizer_flags.h"
-# include "sanitizer_freebsd.h"
# include "sanitizer_getauxval.h"
# include "sanitizer_glibc_version.h"
# include "sanitizer_linux.h"
@@ -46,7 +45,6 @@
# endif
# if SANITIZER_FREEBSD
-# include <osreldate.h>
# include <pthread_np.h>
# include <sys/auxv.h>
# include <sys/sysctl.h>
@@ -629,11 +627,7 @@ void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
# if !SANITIZER_FREEBSD
typedef ElfW(Phdr) Elf_Phdr;
-# elif SANITIZER_WORDSIZE == 32 && __FreeBSD_version <= 902001 // v9.2
-# define Elf_Phdr XElf32_Phdr
-# define dl_phdr_info xdl_phdr_info
-# define dl_iterate_phdr(c, b) xdl_iterate_phdr((c), (b))
-# endif // !SANITIZER_FREEBSD
+# endif
struct DlIteratePhdrData {
InternalMmapVectorNoCtor<LoadedModule> *modules;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 8c7c00de6d12..289ae661c343 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -301,7 +301,8 @@
#define SANITIZER_INTERCEPT_CANONICALIZE_FILE_NAME (SI_GLIBC || SI_SOLARIS)
#define SANITIZER_INTERCEPT_CONFSTR \
(SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
-#define SANITIZER_INTERCEPT_SCHED_GETAFFINITY SI_LINUX_NOT_ANDROID
+#define SANITIZER_INTERCEPT_SCHED_GETAFFINITY \
+ (SI_LINUX_NOT_ANDROID || SI_FREEBSD)
#define SANITIZER_INTERCEPT_SCHED_GETPARAM SI_LINUX_NOT_ANDROID || SI_SOLARIS
#define SANITIZER_INTERCEPT_STRERROR SI_POSIX
#define SANITIZER_INTERCEPT_STRERROR_R SI_POSIX
@@ -462,7 +463,7 @@
(SI_LINUX || SI_MAC || SI_WINDOWS || SI_FREEBSD || SI_NETBSD || SI_SOLARIS)
#define SANITIZER_INTERCEPT_RECV_RECVFROM SI_POSIX
#define SANITIZER_INTERCEPT_SEND_SENDTO SI_POSIX
-#define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE SI_LINUX
+#define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE (SI_LINUX || SI_FREEBSD)
#define SI_STAT_LINUX (SI_LINUX && __GLIBC_PREREQ(2, 33))
#define SANITIZER_INTERCEPT_STAT \
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
index b119f059007d..43b8a38f39be 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
@@ -726,6 +726,8 @@ struct __sanitizer_cpuset {
typedef struct __sanitizer_cpuset __sanitizer_cpuset_t;
extern unsigned struct_cpuset_sz;
+
+typedef unsigned long long __sanitizer_eventfd_t;
} // namespace __sanitizer
# define CHECK_TYPE_SIZE(TYPE) \
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index 58244c9944a0..1ee26e0cbbb9 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -938,6 +938,8 @@ struct __sanitizer_cookie_io_functions_t {
__sanitizer_cookie_io_seek seek;
__sanitizer_cookie_io_close close;
};
+
+typedef unsigned long long __sanitizer_eventfd_t;
#endif
#define IOC_NRBITS 8
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
index 36a82c4ac966..dcfd94fe3225 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
@@ -13,9 +13,6 @@
#include "sanitizer_platform.h"
#if SANITIZER_FREEBSD || SANITIZER_NETBSD
#include "sanitizer_common.h"
-#if SANITIZER_FREEBSD
-#include "sanitizer_freebsd.h"
-#endif
#include "sanitizer_procmaps.h"
// clang-format off
@@ -29,14 +26,6 @@
#include <limits.h>
-// Fix 'kinfo_vmentry' definition on FreeBSD prior v9.2 in 32-bit mode.
-#if SANITIZER_FREEBSD && (SANITIZER_WORDSIZE == 32)
-#include <osreldate.h>
-#if __FreeBSD_version <= 902001 // v9.2
-#define kinfo_vmentry xkinfo_vmentry
-#endif
-#endif
-
namespace __sanitizer {
#if SANITIZER_FREEBSD
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp
index 148470943b47..c11df0ddfde4 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp
@@ -44,6 +44,9 @@ StackStore::Id StackStore::Store(const StackTrace &trace, uptr *pack) {
uptr idx = 0;
*pack = 0;
uptr *stack_trace = Alloc(h.size + 1, &idx, pack);
+ // No more space.
+ if (stack_trace == nullptr)
+ return 0;
*stack_trace = h.ToUptr();
internal_memcpy(stack_trace + 1, trace.trace, h.size * sizeof(uptr));
*pack += blocks_[GetBlockIdx(idx)].Stored(h.size + 1);
@@ -76,8 +79,10 @@ uptr *StackStore::Alloc(uptr count, uptr *idx, uptr *pack) {
uptr block_idx = GetBlockIdx(start);
uptr last_idx = GetBlockIdx(start + count - 1);
if (LIKELY(block_idx == last_idx)) {
- // Fits into the a single block.
- CHECK_LT(block_idx, ARRAY_SIZE(blocks_));
+ // Fits into a single block.
+ // No more available blocks. Indicate inability to allocate more memory.
+ if (block_idx >= ARRAY_SIZE(blocks_))
+ return nullptr;
*idx = start;
return blocks_[block_idx].GetOrCreate(this) + GetInBlockIdx(start);
}
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp
index a746d4621936..3776e8c97057 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp
@@ -215,16 +215,16 @@ StackTrace StackDepotGet(u32 id) {
return theDepot.Get(id);
}
-void StackDepotLockAll() {
- theDepot.LockAll();
+void StackDepotLockBeforeFork() {
+ theDepot.LockBeforeFork();
compress_thread.LockAndStop();
stackStore.LockAll();
}
-void StackDepotUnlockAll() {
+void StackDepotUnlockAfterFork(bool fork_child) {
stackStore.UnlockAll();
compress_thread.Unlock();
- theDepot.UnlockAll();
+ theDepot.UnlockAfterFork(fork_child);
}
void StackDepotPrintAll() {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h
index cca6fd534688..82cf7578d0fb 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h
@@ -39,8 +39,8 @@ StackDepotHandle StackDepotPut_WithHandle(StackTrace stack);
// Retrieves a stored stack trace by the id.
StackTrace StackDepotGet(u32 id);
-void StackDepotLockAll();
-void StackDepotUnlockAll();
+void StackDepotLockBeforeFork();
+void StackDepotUnlockAfterFork(bool fork_child);
void StackDepotPrintAll();
void StackDepotStopBackgroundThread();
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h
index 96d1ddc87fd0..279bc5de3bb9 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h
@@ -52,8 +52,8 @@ class StackDepotBase {
};
}
- void LockAll();
- void UnlockAll();
+ void LockBeforeFork();
+ void UnlockAfterFork(bool fork_child);
void PrintAll();
void TestOnlyUnmap() {
@@ -160,18 +160,33 @@ StackDepotBase<Node, kReservedBits, kTabSizeLog>::Get(u32 id) {
}
template <class Node, int kReservedBits, int kTabSizeLog>
-void StackDepotBase<Node, kReservedBits, kTabSizeLog>::LockAll() {
- for (int i = 0; i < kTabSize; ++i) {
- lock(&tab[i]);
- }
+void StackDepotBase<Node, kReservedBits, kTabSizeLog>::LockBeforeFork() {
+ // Do not lock the hash table. That would be very expensive and is not really
+ // needed: the parent process will neither lock nor unlock. The child process
+ // risks deadlocking on buckets that were already locked at fork time, so to
+ // avoid that we unlock every locked bucket in `UnlockAfterFork`. This may
+ // affect consistency of the hash table, but the only issue is that a few
+ // items inserted by the parent may not be found by the child, which may then
+ // insert them again, wasting some space in `stackStore`.
+
+ // We still need to lock nodes.
+ nodes.Lock();
}
template <class Node, int kReservedBits, int kTabSizeLog>
-void StackDepotBase<Node, kReservedBits, kTabSizeLog>::UnlockAll() {
+void StackDepotBase<Node, kReservedBits, kTabSizeLog>::UnlockAfterFork(
+ bool fork_child) {
+ nodes.Unlock();
+
+ // Only unlock in child process to avoid deadlock. See `LockBeforeFork`.
+ if (!fork_child)
+ return;
+
for (int i = 0; i < kTabSize; ++i) {
atomic_uint32_t *p = &tab[i];
uptr s = atomic_load(p, memory_order_relaxed);
- unlock(p, s & kUnlockMask);
+ if (s & kLockMask)
+ unlock(p, s & kUnlockMask);
}
}
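
The kLockMask/unlock() usage above suggests that each hash bucket is a word whose top bit doubles as a lock flag, which is what lets the fork child simply clear any lock bits it inherited. A hedged sketch of that idiom with made-up names (kLockBit, LockBucket), not the sanitizer_common definitions:

// Bucket word with an embedded lock bit; the child clears inherited lock bits.
#include <atomic>
#include <cassert>
#include <cstdint>

static constexpr uint32_t kLockBit = 1u << 31;

static void LockBucket(std::atomic<uint32_t> &Bucket) {
  // Spin until this thread is the one that set the lock bit.
  while (Bucket.fetch_or(kLockBit, std::memory_order_acquire) & kLockBit) {
  }
}

static void UnlockInheritedBucket(std::atomic<uint32_t> &Bucket) {
  uint32_t V = Bucket.load(std::memory_order_relaxed);
  if (V & kLockBit) // only touch buckets the parent left locked
    Bucket.store(V & ~kLockBit, std::memory_order_release);
}

int main() {
  std::atomic<uint32_t> Bucket{42};
  LockBucket(Bucket);
  UnlockInheritedBucket(Bucket); // what the fork child does per bucket
  assert(Bucket.load() == 42);
  return 0;
}
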
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
index 28f11352a6b5..0ddc24802d21 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
@@ -341,15 +341,14 @@ __sanitizer_symbolize_set_inline_frames(bool InlineFrames);
class InternalSymbolizer final : public SymbolizerTool {
public:
static InternalSymbolizer *get(LowLevelAllocator *alloc) {
- if (&__sanitizer_symbolize_set_demangle)
- CHECK(__sanitizer_symbolize_set_demangle(common_flags()->demangle));
- if (&__sanitizer_symbolize_set_inline_frames)
- CHECK(__sanitizer_symbolize_set_inline_frames(
- common_flags()->symbolize_inline_frames));
- // These are essential, we don't have InternalSymbolizer without them.
- if (&__sanitizer_symbolize_code && &__sanitizer_symbolize_data)
- return new (*alloc) InternalSymbolizer();
- return 0;
+ // This is the most commonly used entry point, so use it to detect the
+ // presence of the internal symbolizer.
+ if (&__sanitizer_symbolize_code == nullptr)
+ return nullptr;
+ CHECK(__sanitizer_symbolize_set_demangle(common_flags()->demangle));
+ CHECK(__sanitizer_symbolize_set_inline_frames(
+ common_flags()->symbolize_inline_frames));
+ return new (*alloc) InternalSymbolizer();
}
bool SymbolizePC(uptr addr, SymbolizedStack *stack) override {
@@ -371,8 +370,6 @@ class InternalSymbolizer final : public SymbolizerTool {
}
bool SymbolizeFrame(uptr addr, FrameInfo *info) override {
- if (&__sanitizer_symbolize_frame == nullptr)
- return false;
bool result = __sanitizer_symbolize_frame(info->module, info->module_offset,
buffer_, sizeof(buffer_));
if (result)
@@ -380,14 +377,10 @@ class InternalSymbolizer final : public SymbolizerTool {
return result;
}
- void Flush() override {
- if (&__sanitizer_symbolize_flush)
- __sanitizer_symbolize_flush();
- }
+ void Flush() override { __sanitizer_symbolize_flush(); }
const char *Demangle(const char *name) override {
- if (&__sanitizer_symbolize_demangle &&
- __sanitizer_symbolize_demangle(name, buffer_, sizeof(buffer_))) {
+ if (__sanitizer_symbolize_demangle(name, buffer_, sizeof(buffer_))) {
char *res_buff = nullptr;
ExtractToken(buffer_, "", &res_buff);
return res_buff;
diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp b/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp
index 4bdf75332bf3..c851dbbf2eb2 100644
--- a/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp
+++ b/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp
@@ -71,7 +71,7 @@ bool __sanitizer_symbolize_code(const char *ModuleName, uint64_t ModuleOffset,
auto Printer = std::make_unique<llvm::symbolize::LLVMPrinter>(
OS, symbolize_error_handler(OS), Config);
- // TODO: it is neccessary to set proper SectionIndex here.
+ // TODO: it is necessary to set proper SectionIndex here.
// object::SectionedAddress::UndefSection works for only absolute addresses.
if (InlineFrames) {
auto ResOrErr = getDefaultSymbolizer()->symbolizeInlinedCode(
@@ -103,7 +103,7 @@ bool __sanitizer_symbolize_data(const char *ModuleName, uint64_t ModuleOffset,
auto Printer = std::make_unique<llvm::symbolize::LLVMPrinter>(
OS, symbolize_error_handler(OS), Config);
- // TODO: it is neccessary to set proper SectionIndex here.
+ // TODO: it is necessary to set proper SectionIndex here.
// object::SectionedAddress::UndefSection works for only absolute addresses.
auto ResOrErr = getDefaultSymbolizer()->symbolizeData(
ModuleName,
@@ -126,7 +126,7 @@ bool __sanitizer_symbolize_frame(const char *ModuleName, uint64_t ModuleOffset,
auto Printer = std::make_unique<llvm::symbolize::LLVMPrinter>(
OS, symbolize_error_handler(OS), Config);
- // TODO: it is neccessary to set proper SectionIndex here.
+ // TODO: it is necessary to set proper SectionIndex here.
// object::SectionedAddress::UndefSection works for only absolute addresses.
auto ResOrErr = getDefaultSymbolizer()->symbolizeFrame(
ModuleName,
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
index 3835ce26c4d5..479e4a0c184f 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp
@@ -148,6 +148,10 @@ static struct StackDepotBenchmarkParams {
{500000, 10, 16, true, false},
{1500000, 10, 4, true, true},
{800000, 10, 16, true, true},
+ // Go crazy and create so many unique stacks that StackStore runs out of
+ // space.
+ {1000000, 1, 128, true, true},
+ {100000000, 1, 1, true, true},
};
static std::string PrintStackDepotBenchmarkParams(
diff --git a/compiler-rt/lib/tsan/go/buildgo.sh b/compiler-rt/lib/tsan/go/buildgo.sh
index 0bd59368cc46..78ba41a0bdc6 100755
--- a/compiler-rt/lib/tsan/go/buildgo.sh
+++ b/compiler-rt/lib/tsan/go/buildgo.sh
@@ -10,6 +10,8 @@ if [ "`uname -a | grep Linux`" != "" ]; then
HOST_GOARCH="amd64"
elif [ "`uname -a | grep aarch64`" != "" ]; then
HOST_GOARCH="arm64"
+ elif [ "`uname -a | grep loongarch64`" != "" ]; then
+ HOST_GOARCH="loong64"
elif [ "`uname -a | grep -i mips64`" != "" ]; then
if [ "`lscpu | grep -i Little`" != "" ]; then
HOST_GOARCH="mips64le"
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
index fdcba6e8ca73..a9f6673ac44e 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
@@ -2918,7 +2918,9 @@ void InitializeInterceptors() {
TSAN_INTERCEPT(pthread_mutex_trylock);
TSAN_INTERCEPT(pthread_mutex_timedlock);
TSAN_INTERCEPT(pthread_mutex_unlock);
+#if SANITIZER_LINUX
TSAN_INTERCEPT(pthread_mutex_clocklock);
+#endif
#if SANITIZER_GLIBC
# if !__GLIBC_PREREQ(2, 34)
TSAN_INTERCEPT(__pthread_mutex_lock);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform.h b/compiler-rt/lib/tsan/rtl/tsan_platform.h
index 70b9ae09a990..84ff4bfade09 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform.h
@@ -622,6 +622,35 @@ struct MappingGoAarch64 {
static const uptr kShadowAdd = 0x200000000000ull;
};
+/* Go on linux/loongarch64 (47-bit VMA)
+0000 0000 1000 - 0000 1000 0000: executable
+0000 1000 0000 - 00c0 0000 0000: -
+00c0 0000 0000 - 00e0 0000 0000: heap
+00e0 0000 0000 - 2000 0000 0000: -
+2000 0000 0000 - 2800 0000 0000: shadow
+2800 0000 0000 - 3000 0000 0000: -
+3000 0000 0000 - 3200 0000 0000: metainfo (memory blocks and sync objects)
+3200 0000 0000 - 8000 0000 0000: -
+*/
+struct MappingGoLoongArch64_47 {
+ static const uptr kMetaShadowBeg = 0x300000000000ull;
+ static const uptr kMetaShadowEnd = 0x320000000000ull;
+ static const uptr kShadowBeg = 0x200000000000ull;
+ static const uptr kShadowEnd = 0x280000000000ull;
+ static const uptr kLoAppMemBeg = 0x000000001000ull;
+ static const uptr kLoAppMemEnd = 0x00e000000000ull;
+ static const uptr kMidAppMemBeg = 0;
+ static const uptr kMidAppMemEnd = 0;
+ static const uptr kHiAppMemBeg = 0;
+ static const uptr kHiAppMemEnd = 0;
+ static const uptr kHeapMemBeg = 0;
+ static const uptr kHeapMemEnd = 0;
+ static const uptr kVdsoBeg = 0;
+ static const uptr kShadowMsk = 0;
+ static const uptr kShadowXor = 0;
+ static const uptr kShadowAdd = 0x200000000000ull;
+};
+
/*
Go on linux/mips64 (47-bit VMA)
0000 0000 1000 - 0000 1000 0000: executable
@@ -697,6 +726,8 @@ ALWAYS_INLINE auto SelectMapping(Arg arg) {
return Func::template Apply<MappingGoS390x>(arg);
# elif defined(__aarch64__)
return Func::template Apply<MappingGoAarch64>(arg);
+# elif defined(__loongarch_lp64)
+ return Func::template Apply<MappingGoLoongArch64_47>(arg);
# elif SANITIZER_WINDOWS
return Func::template Apply<MappingGoWindows>(arg);
# else
@@ -765,6 +796,7 @@ void ForEachMapping() {
Func::template Apply<MappingGoPPC64_46>();
Func::template Apply<MappingGoPPC64_47>();
Func::template Apply<MappingGoAarch64>();
+ Func::template Apply<MappingGoLoongArch64_47>();
Func::template Apply<MappingGoMips64_47>();
Func::template Apply<MappingGoS390x>();
}
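
Purely for illustration (not part of the tsan runtime; the helper names are
made up), a sketch that classifies an address against the new 47-bit
linux/loongarch64 Go layout using the constants declared in the hunk above.

```cpp
// Illustrative only: range checks over the MappingGoLoongArch64_47 constants.
// The real runtime dispatches through SelectMapping/Func::Apply instead.
#include <cstdint>
#include <cstdio>

namespace {
constexpr std::uint64_t kShadowBeg = 0x200000000000ull;
constexpr std::uint64_t kShadowEnd = 0x280000000000ull;
constexpr std::uint64_t kLoAppMemBeg = 0x000000001000ull;
constexpr std::uint64_t kLoAppMemEnd = 0x00e000000000ull;

bool IsAppMem(std::uint64_t a) { return a >= kLoAppMemBeg && a < kLoAppMemEnd; }
bool IsShadowMem(std::uint64_t a) { return a >= kShadowBeg && a < kShadowEnd; }
}  // namespace

int main() {
  // A heap address (00c0 0000 0000 - 00e0 0000 0000) falls in low app memory.
  std::printf("app: %d\n", IsAppMem(0x00c000001000ull));
  // An address inside 2000 0000 0000 - 2800 0000 0000 is shadow memory.
  std::printf("shadow: %d\n", IsShadowMem(0x210000000000ull));
  return 0;
}
```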
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp
index 369509ed0a60..b45adea45b27 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp
@@ -238,7 +238,13 @@ void InitializePlatformEarly() {
Printf("FATAL: Found %zd - Supported 47\n", vmaSize);
Die();
}
-# endif
+# else
+ if (vmaSize != 47) {
+ Printf("FATAL: ThreadSanitizer: unsupported VMA range\n");
+ Printf("FATAL: Found %zd - Supported 47\n", vmaSize);
+ Die();
+ }
+# endif
#elif defined(__powerpc64__)
# if !SANITIZER_GO
if (vmaSize != 44 && vmaSize != 46 && vmaSize != 47) {
diff --git a/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c b/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
index 4e6638be584b..d390017dd755 100644
--- a/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
+++ b/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c
@@ -36,36 +36,36 @@ int main(int argc, char **argv) {
}
#endif
-// CHECK40: allocated heap chunk; size: 32 offset: 8
-// CHECK40: Cause: heap-buffer-overflow
-// CHECK40: is located 10 bytes after a 30-byte region
-//
-// CHECK80: allocated heap chunk; size: 32 offset: 16
-// CHECK80: Cause: heap-buffer-overflow
-// CHECK80: is located 50 bytes after a 30-byte region
-//
-// CHECKm30: Cause: heap-buffer-overflow
-// CHECKm30: is located 30 bytes before a 30-byte region
-//
-// CHECKMm30: is a large allocated heap chunk; size: 1003520 offset: -30
-// CHECKMm30: Cause: heap-buffer-overflow
-// CHECKMm30: is located 30 bytes before a 1000000-byte region
-//
-// CHECKM: is a large allocated heap chunk; size: 1003520 offset: 1000000
-// CHECKM: Cause: heap-buffer-overflow
-// CHECKM: is located 0 bytes after a 1000000-byte region
-//
-// CHECK31: tags: [[TAG:..]]/0e([[TAG]]) (ptr/mem)
-// CHECK31-NOT: Invalid access starting at offset
-// CHECK31: Cause: heap-buffer-overflow
-// CHECK31: is located 1 bytes after a 30-byte region
-// CHECK31: Memory tags around the buggy address
-// CHECK31: [0e]
-// CHECK31: Tags for short granules around the buggy address
-// CHECK31: {{\[}}[[TAG]]]
-//
-// CHECK20-NOT: Invalid access starting at offset
-// CHECK20: Cause: heap-buffer-overflow
-// CHECK20: is located 10 bytes after a 20-byte region [0x{{.*}}0,0x{{.*}}4)
+ // CHECK40: allocated heap chunk; size: 32 offset: 8
+ // CHECK40: Cause: heap-buffer-overflow
+ // CHECK40: is located 10 bytes after a 30-byte region
+ //
+ // CHECK80: allocated heap chunk; size: 32 offset: 16
+ // CHECK80: Cause: heap-buffer-overflow
+ // CHECK80: is located 50 bytes after a 30-byte region
+ //
+ // CHECKm30: Cause: heap-buffer-overflow
+ // CHECKm30: is located 30 bytes before a 30-byte region
+ //
+ // CHECKMm30: is a large allocated heap chunk; size: 1003520 offset: -30
+ // CHECKMm30: Cause: heap-buffer-overflow
+ // CHECKMm30: is located 30 bytes before a 1000000-byte region
+ //
+ // CHECKM: is a large allocated heap chunk; size: 1003520 offset: 1000000
+ // CHECKM: Cause: heap-buffer-overflow
+ // CHECKM: is located 0 bytes after a 1000000-byte region
+ //
+ // CHECK31: tags: [[TAG:..]]/0e([[TAG]]) (ptr/mem)
+ // CHECK31-NOT: Invalid access starting at offset
+ // CHECK31: Cause: heap-buffer-overflow
+ // CHECK31: is located 1 bytes after a 30-byte region
+ // CHECK31: Memory tags around the buggy address
+ // CHECK31: [0e]
+ // CHECK31: Tags for short granules around the buggy address
+ // CHECK31: {{\[}}[[TAG]]]
+ //
+ // CHECK20-NOT: Invalid access starting at offset
+ // CHECK20: Cause: heap-buffer-overflow
+ // CHECK20: is located 10 bytes after a 20-byte region [0x{{.*}}0,0x{{.*}}4)
free(x);
}
diff --git a/compiler-rt/test/hwasan/TestCases/stack-overflow.c b/compiler-rt/test/hwasan/TestCases/stack-overflow.c
new file mode 100644
index 000000000000..4af506e3ecf4
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/stack-overflow.c
@@ -0,0 +1,24 @@
+// RUN: %clang_hwasan -g %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+// Stack histories currently are not recorded on x86.
+// XFAIL: target=x86_64{{.*}}
+
+__attribute((noinline)) void buggy() {
+ char c[64];
+ char *volatile p = c;
+ p[65] = 0;
+}
+
+int main() {
+ buggy();
+ // CHECK: WRITE of size 1 at
+ // CHECK: #0 {{.*}} in buggy{{.*}}stack-overflow.c:[[@LINE-6]]
+ // CHECK: Cause: stack tag-mismatch
+ // CHECK: is located in stack of thread
+ // CHECK: Potentially referenced stack objects:
+ // CHECK: Cause: stack-buffer-overflow
+ // CHECK-NEXT: 0x{{.*}} is located 1 bytes after a 64-byte local variable c [0x{{.*}},0x{{.*}}) in buggy {{.*}}stack-overflow.c:
+ // CHECK: Memory tags around the buggy address
+
+ // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy
+}
diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c b/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c
index b06568e12eba..14b9cba8aa5e 100644
--- a/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c
+++ b/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c
@@ -21,6 +21,7 @@ char *buggy(int b) {
int main() {
char *p = buggy(1);
// CHECK: Potentially referenced stack objects:
- // CHECK-NEXT: c in buggy
+ // CHECK-NEXT: use-after-scope
+ // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 64-byte local variable c [0x{{.*}},0x{{.*}}) in buggy
p[0] = 0;
}
diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar.c b/compiler-rt/test/hwasan/TestCases/stack-uar.c
index 48440a47d5f5..9fd4381a8049 100644
--- a/compiler-rt/test/hwasan/TestCases/stack-uar.c
+++ b/compiler-rt/test/hwasan/TestCases/stack-uar.c
@@ -50,15 +50,16 @@ int main() {
// CHECK: Cause: stack tag-mismatch
// CHECK: is located in stack of thread
// CHECK: Potentially referenced stack objects:
- // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uar.c:
- // CHECK-NEXT: Memory tags around the buggy address
+ // CHECK: Cause: use-after-scope
+ // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte local variable {{zzz|yyy}} [0x{{.*}},0x{{.*}}) in buggy {{.*}}stack-uar.c:
+ // CHECK: Memory tags around the buggy address
// NOSYM: Previously allocated frames:
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}}
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}}
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}}
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}}
- // NOSYM-NEXT: Memory tags around the buggy address
+ // NOSYM: Memory tags around the buggy address
// CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in main
}
diff --git a/compiler-rt/test/hwasan/TestCases/stack-uas.c b/compiler-rt/test/hwasan/TestCases/stack-uas.c
index 4455e5910074..a0e4eb02dd22 100644
--- a/compiler-rt/test/hwasan/TestCases/stack-uas.c
+++ b/compiler-rt/test/hwasan/TestCases/stack-uas.c
@@ -69,15 +69,16 @@ int main() {
// CHECK: Cause: stack tag-mismatch
// CHECK: is located in stack of thread
// CHECK: Potentially referenced stack objects:
- // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uas.c:
- // CHECK-NEXT: Memory tags around the buggy address
+ // CHECK: Cause: use-after-scope
+ // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte local variable {{zzz|yyy}} [0x{{.*}}) in buggy {{.*}}stack-uas.c:
+ // CHECK: Memory tags around the buggy address
// NOSYM: Previously allocated frames:
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
// NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}}
- // NOSYM-NEXT: Memory tags around the buggy address
+ // NOSYM: Memory tags around the buggy address
// CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy
}
diff --git a/compiler-rt/test/hwasan/TestCases/stack-underflow.c b/compiler-rt/test/hwasan/TestCases/stack-underflow.c
new file mode 100644
index 000000000000..e13955ed37b4
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/stack-underflow.c
@@ -0,0 +1,24 @@
+// RUN: %clang_hwasan -g %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+// Stack histories currently are not recorded on x86.
+// XFAIL: target=x86_64{{.*}}
+
+__attribute((noinline)) void buggy() {
+ char c[64];
+ char *volatile p = c;
+ p[-2] = 0;
+}
+
+int main() {
+ buggy();
+ // CHECK: WRITE of size 1 at
+ // CHECK: #0 {{.*}} in buggy{{.*}}stack-underflow.c:[[@LINE-6]]
+ // CHECK: Cause: stack tag-mismatch
+ // CHECK: is located in stack of thread
+ // CHECK: Potentially referenced stack objects:
+ // CHECK: Cause: stack-buffer-overflow
+ // CHECK-NEXT: 0x{{.*}} is located 2 bytes before a 64-byte local variable c [0x{{.*}},0x{{.*}}) in buggy {{.*}}stack-underflow.c:
+ // CHECK: Memory tags around the buggy address
+
+ // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy
+}
diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c
new file mode 100644
index 000000000000..22705ed35ce7
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c
@@ -0,0 +1,27 @@
+// RUN: %clang_hwasan -O0 -g %s -o %t && %env_hwasan_opts=strip_path_prefix=/TestCases/ not %run %t 2>&1 | FileCheck %s
+
+// Stack histories currently are not recorded on x86.
+// XFAIL: target=x86_64{{.*}}
+
+#include <assert.h>
+#include <sanitizer/hwasan_interface.h>
+#include <stdio.h>
+
+int t;
+
+__attribute__((noinline)) char *buggy() {
+ char *volatile p;
+ char zzz = {};
+ char yyy = {};
+ p = t ? &yyy : &zzz;
+ return p;
+}
+
+int main() {
+ char *p = buggy();
+ return *p;
+ // CHECK: READ of size 1 at
+ // CHECK: #0 {{.*}} in main strip_path_prefix.c:[[@LINE-2]]
+ // CHECK: Potentially referenced stack objects:
+ // CHECK: in buggy strip_path_prefix.c:[[@LINE-12]]
+}
diff --git a/compiler-rt/test/msan/Linux/eventfd.cpp b/compiler-rt/test/msan/eventfd.cpp
index 4399211258ff..638b883fd2c1 100644
--- a/compiler-rt/test/msan/Linux/eventfd.cpp
+++ b/compiler-rt/test/msan/eventfd.cpp
@@ -1,4 +1,7 @@
-// RUN: %clangxx_msan -O0 %s -o %t && %run %t 2>&1
+/* RUN: %clangxx_msan -O0 %s -o %t && %run %t 2>&1
+
+ REQUIRES: target={{.*(linux|freebsd).*}}
+*/
#include <assert.h>
#include <sys/eventfd.h>
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 03d431046648..16eb67f2e27c 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -315,6 +315,9 @@ end
* When a file included via an `INCLUDE` line or `#include` directive
has a continuation marker at the end of its last line in free form,
Fortran line continuation works.
+* A `NAMELIST` input group may omit its trailing `/` character if
+ it is followed by another `NAMELIST` input group.
+* A `NAMELIST` input group may begin with either `&` or `$`.
### Extensions supported when enabled by options
@@ -641,6 +644,16 @@ module m
end
```
+* When an intrinsic procedure appears in the specification part of a module
+ only in function references, but not in an explicit `INTRINSIC` statement,
+ its name is not brought into other scopes by a `USE` statement.
+
+* Should hexadecimal floating-point input editing apply any rounding?
+ F'2023 subclause 13.7.2.3.8 only discusses rounding in the context of
+ decimal-to-binary conversion; it would seem to not apply, and so
+ we don't round. This seems to be how the Intel Fortran compilers
+ behave.
+
## De Facto Standard Features
* `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
@@ -650,3 +663,7 @@ end
but every Fortran compiler allows the encoding to be changed on an
open unit.
+* A `NAMELIST` input item that references a scalar element of a vector
+ or contiguous array can be used as the initial element of a storage
+ sequence. For example, "&GRP A(1)=1. 2. 3./" is treated as if it had been
+ "&GRP A(1:)=1. 2. 3./".
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index 5231e78335f6..fa39889927e0 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -163,6 +163,63 @@ forward compiler options to the frontend driver, `flang-new -fc1`.
You can read more on the design of `clangDriver` in Clang's [Driver Design &
Internals](https://clang.llvm.org/docs/DriverInternals.html).
+## Linker Driver
+When used as a linker, Flang's frontend driver assembles the command line for an
+external linker command (e.g., LLVM's `lld`) and invokes it to create the final
+executable by linking static and shared libraries together with all the
+translation units supplied as object files.
+
+By default, the Flang linker driver adds several libraries to the linker
+invocation to make sure that all entry points for program start
+(Fortran's program unit) and runtime routines can be resolved by the linker.
+
+An abridged example (only showing the Fortran-specific linker flags, omission
+indicated by `[...]`) for such a linker invocation on a Linux system would look
+like this:
+
+```
+$ flang -v -o example example.o
+"/usr/bin/ld" [...] example.o [...] "--whole-archive" "-lFortran_main"
+"--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" [...]
+```
+
+The automatically added libraries are:
+
+* `Fortran_main`: Provides the main entry point `main` that then invokes
+ `_QQmain` with the Fortran program unit. This library has a dependency on
+ the `FortranRuntime` library.
+* `FortranRuntime`: Provides most of the Flang runtime library.
+* `FortranDecimal`: Provides operations for decimal numbers.
+
+By default, when Flang is used as the linker, it is assumed that one of the
+Fortran translation units provides the program unit and that Fortran is the
+main code part (calling into C/C++ routines via `BIND (C)` interfaces). When
+composing the linker command line, Flang uses `--whole-archive` and
+`--no-whole-archive` (Windows: `/WHOLEARCHIVE:`, Darwin & AIX: *not
+implemented yet*) to make sure that all of `Fortran_main` is processed by the
+linker. This is done to issue a proper error message when multiple definitions
+of `main` occur, for instance when linking code that has a Fortran program
+unit with C/C++ code that also defines a `main` function. A user may be
+required to explicitly provide the C++ runtime libraries at link time (e.g.,
+via `-lstdc++` for the STL).
+
+If the code is C/C++ based and invokes Fortran routines, one can use either
+Clang or Flang as the linker driver. If Clang is used, it automatically adds
+all runtime libraries needed by C++ (e.g., for the STL) to the linker invocation.
+In this case, one has to explicitly provide the Fortran runtime libraries
+`FortranRuntime` and/or `FortranDecimal`. An alternative is to use Flang to link
+and use the `-fno-fortran-main` flag. This flag removes
+`Fortran_main` from the linker stage and hence requires one of the C/C++
+translation units to provide a definition of the `main` function. In this case,
+it may be required to explicitly supply C++ runtime libraries as mentioned above.
+
+When creating shared or static libraries using Flang with the `-shared` or
+`-static` flag, `Fortran_main` is automatically removed from the linker stage
+(i.e., `-fno-fortran-main` is on by default). It is assumed that when creating a
+static or shared library, the generated library does not need a `main`
+function, as a final link stage will occur that will provide the `Fortran_main`
+library when creating the final executable.
+
## Frontend Driver
Flang's frontend driver is the main interface between compiler developers and
the Flang frontend. The high-level design is similar to Clang's frontend
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index fef2b4ea4dd8..189920a0881b 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -695,6 +695,11 @@ CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC
MALLOC
```
+### Library subroutine
+```
+CALL GETLOG(USRNAME)
+```
+
## Intrinsic Procedure Name Resolution
When the name of a procedure in a program is the same as the one of an intrinsic
@@ -754,6 +759,7 @@ This phase currently supports all the intrinsic procedures listed above but the
| Intrinsic subroutines |MVBITS (elemental), CPU_TIME, DATE_AND_TIME, EVENT_QUERY, EXECUTE_COMMAND_LINE, GET_COMMAND, GET_COMMAND_ARGUMENT, GET_ENVIRONMENT_VARIABLE, MOVE_ALLOC, RANDOM_INIT, RANDOM_NUMBER, RANDOM_SEED, SYSTEM_CLOCK |
| Atomic intrinsic subroutines | ATOMIC_ADD |
| Collective intrinsic subroutines | CO_REDUCE |
+| Library subroutines | GETLOG |
### Intrinsic Function Folding
diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp
index 6f10553cdcb4..2338fa1b14a3 100644
--- a/flang/examples/FeatureList/FeatureList.cpp
+++ b/flang/examples/FeatureList/FeatureList.cpp
@@ -281,7 +281,7 @@ public:
READ_FEATURE(ErrorRecovery)
READ_FEATURE(EventPostStmt)
READ_FEATURE(EventWaitStmt)
- READ_FEATURE(EventWaitStmt::EventWaitSpec)
+ READ_FEATURE(EventWaitSpec)
READ_FEATURE(ExecutableConstruct)
READ_FEATURE(ExecutionPart)
READ_FEATURE(ExecutionPartConstruct)
@@ -438,6 +438,7 @@ public:
READ_FEATURE(NamelistStmt::Group)
READ_FEATURE(NonLabelDoStmt)
READ_FEATURE(NoPass)
+ READ_FEATURE(NotifyWaitStmt)
READ_FEATURE(NullifyStmt)
READ_FEATURE(NullInit)
READ_FEATURE(ObjectDecl)
diff --git a/flang/include/flang/Common/uint128.h b/flang/include/flang/Common/uint128.h
index bfd2eef01f6f..03e44eb6997d 100644
--- a/flang/include/flang/Common/uint128.h
+++ b/flang/include/flang/Common/uint128.h
@@ -33,15 +33,18 @@ public:
constexpr Int128(unsigned n) : low_{n} {}
constexpr Int128(unsigned long n) : low_{n} {}
constexpr Int128(unsigned long long n) : low_{n} {}
- constexpr Int128(int n)
- : low_{static_cast<std::uint64_t>(n)}, high_{-static_cast<std::uint64_t>(
- n < 0)} {}
- constexpr Int128(long n)
- : low_{static_cast<std::uint64_t>(n)}, high_{-static_cast<std::uint64_t>(
- n < 0)} {}
- constexpr Int128(long long n)
- : low_{static_cast<std::uint64_t>(n)}, high_{-static_cast<std::uint64_t>(
- n < 0)} {}
+ constexpr Int128(int n) {
+ low_ = static_cast<std::uint64_t>(n);
+ high_ = -static_cast<std::uint64_t>(n < 0);
+ }
+ constexpr Int128(long n) {
+ low_ = static_cast<std::uint64_t>(n);
+ high_ = -static_cast<std::uint64_t>(n < 0);
+ }
+ constexpr Int128(long long n) {
+ low_ = static_cast<std::uint64_t>(n);
+ high_ = -static_cast<std::uint64_t>(n < 0);
+ }
constexpr Int128(const Int128 &) = default;
constexpr Int128(Int128 &&) = default;
constexpr Int128 &operator=(const Int128 &) = default;
@@ -246,7 +249,10 @@ public:
}
private:
- constexpr Int128(std::uint64_t hi, std::uint64_t lo) : low_{lo}, high_{hi} {}
+ constexpr Int128(std::uint64_t hi, std::uint64_t lo) {
+ low_ = lo;
+ high_ = hi;
+ }
constexpr int LeadingZeroes() const {
if (high_ == 0) {
return 64 + LeadingZeroBitCount(low_);
@@ -255,7 +261,13 @@ private:
}
}
static constexpr std::uint64_t topBit{std::uint64_t{1} << 63};
+#if FLANG_LITTLE_ENDIAN
std::uint64_t low_{0}, high_{0};
+#elif FLANG_BIG_ENDIAN
+ std::uint64_t high_{0}, low_{0};
+#else
+#error host endianness is not known
+#endif
};
using UnsignedInt128 = Int128<false>;
diff --git a/flang/include/flang/Decimal/binary-floating-point.h b/flang/include/flang/Decimal/binary-floating-point.h
index b9346a8585e2..d1992819f85a 100644
--- a/flang/include/flang/Decimal/binary-floating-point.h
+++ b/flang/include/flang/Decimal/binary-floating-point.h
@@ -143,7 +143,7 @@ public:
if (IsNaN() || IsInfinite() || keepBits >= binaryPrecision) {
return true;
}
- int lostBits{binaryPrecision - keepBits};
+ int lostBits{keepBits < binaryPrecision ? binaryPrecision - keepBits : 0};
RawType lostMask{static_cast<RawType>((RawType{1} << lostBits) - 1)};
if (RawType lost{static_cast<RawType>(raw_ & lostMask)}; lost != 0) {
bool increase{false};
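
As an aside, a small self-contained sketch (generic C++, not flang code) of why
a clamped shift count like the `lostBits` guard above is useful: shifting by a
negative or too-large amount is undefined behaviour, so the guard keeps the
subsequent `RawType{1} << lostBits` well defined on every path.

```cpp
// Illustrative only: building a low-bits mask with a clamped shift count.
#include <cassert>
#include <cstdint>

std::uint32_t LowBitsMask(int width, int keepBits) {
  int lostBits = keepBits < width ? width - keepBits : 0;  // clamp, as above
  if (lostBits <= 0)
    return 0u;        // nothing is lost, so the mask is empty
  if (lostBits >= 32)
    return ~0u;       // avoid an oversized (undefined) shift
  return (std::uint32_t{1} << lostBits) - 1u;
}

int main() {
  assert(LowBitsMask(24, 24) == 0u);    // keep everything
  assert(LowBitsMask(24, 20) == 0xFu);  // lose the low four bits
  assert(LowBitsMask(24, 30) == 0u);    // clamped: keepBits exceeds width
  return 0;
}
```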
diff --git a/flang/include/flang/Decimal/decimal.h b/flang/include/flang/Decimal/decimal.h
index a4e0ee7c8474..f0997fb63df0 100644
--- a/flang/include/flang/Decimal/decimal.h
+++ b/flang/include/flang/Decimal/decimal.h
@@ -34,6 +34,7 @@ enum ConversionResultFlags {
Overflow = 1,
Inexact = 2,
Invalid = 4,
+ Underflow = 8,
};
struct ConversionToDecimalResult {
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 8a47a9f65166..c0cbb05c009d 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1227,10 +1227,12 @@ bool IsFunctionResult(const Symbol &);
bool IsKindTypeParameter(const Symbol &);
bool IsLenTypeParameter(const Symbol &);
bool IsExtensibleType(const DerivedTypeSpec *);
+bool IsSequenceOrBindCType(const DerivedTypeSpec *);
bool IsBuiltinDerivedType(const DerivedTypeSpec *derived, const char *name);
bool IsBuiltinCPtr(const Symbol &);
bool IsEventType(const DerivedTypeSpec *);
bool IsLockType(const DerivedTypeSpec *);
+bool IsNotifyType(const DerivedTypeSpec *);
// Is this derived type TEAM_TYPE from module ISO_FORTRAN_ENV?
bool IsTeamType(const DerivedTypeSpec *);
// Is this derived type TEAM_TYPE, C_PTR, or C_FUNPTR?
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index 72e7bdab12a1..9d03ec88a56b 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -38,6 +38,7 @@ CODEGENOPT(Underscoring, 1, 1)
ENUM_CODEGENOPT(RelocationModel, llvm::Reloc::Model, 3, llvm::Reloc::PIC_) ///< Name of the relocation model to use.
ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codegenoptions::NoDebugInfo) ///< Level of debug info to generate
ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use
+ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers
#undef CODEGENOPT
#undef ENUM_CODEGENOPT
diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h
index dd384c516263..4a28d3322a38 100644
--- a/flang/include/flang/ISO_Fortran_binding.h
+++ b/flang/include/flang/ISO_Fortran_binding.h
@@ -189,8 +189,8 @@ RT_API_ATTRS void *CFI_address(
RT_API_ATTRS int CFI_allocate(CFI_cdesc_t *, const CFI_index_t lower_bounds[],
const CFI_index_t upper_bounds[], size_t elem_len);
RT_API_ATTRS int CFI_deallocate(CFI_cdesc_t *);
-int CFI_establish(CFI_cdesc_t *, void *base_addr, CFI_attribute_t, CFI_type_t,
- size_t elem_len, CFI_rank_t, const CFI_index_t extents[]);
+RT_API_ATTRS int CFI_establish(CFI_cdesc_t *, void *base_addr, CFI_attribute_t,
+ CFI_type_t, size_t elem_len, CFI_rank_t, const CFI_index_t extents[]);
RT_API_ATTRS int CFI_is_contiguous(const CFI_cdesc_t *);
RT_API_ATTRS int CFI_section(CFI_cdesc_t *, const CFI_cdesc_t *source,
const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[],
diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h
index 9c6696ff79da..8d32c3235291 100644
--- a/flang/include/flang/Lower/PFTBuilder.h
+++ b/flang/include/flang/Lower/PFTBuilder.h
@@ -100,13 +100,14 @@ using ActionStmts = std::tuple<
parser::EventPostStmt, parser::EventWaitStmt, parser::ExitStmt,
parser::FailImageStmt, parser::FlushStmt, parser::FormTeamStmt,
parser::GotoStmt, parser::IfStmt, parser::InquireStmt, parser::LockStmt,
- parser::NullifyStmt, parser::OpenStmt, parser::PointerAssignmentStmt,
- parser::PrintStmt, parser::ReadStmt, parser::ReturnStmt, parser::RewindStmt,
- parser::StopStmt, parser::SyncAllStmt, parser::SyncImagesStmt,
- parser::SyncMemoryStmt, parser::SyncTeamStmt, parser::UnlockStmt,
- parser::WaitStmt, parser::WhereStmt, parser::WriteStmt,
- parser::ComputedGotoStmt, parser::ForallStmt, parser::ArithmeticIfStmt,
- parser::AssignStmt, parser::AssignedGotoStmt, parser::PauseStmt>;
+ parser::NotifyWaitStmt, parser::NullifyStmt, parser::OpenStmt,
+ parser::PointerAssignmentStmt, parser::PrintStmt, parser::ReadStmt,
+ parser::ReturnStmt, parser::RewindStmt, parser::StopStmt,
+ parser::SyncAllStmt, parser::SyncImagesStmt, parser::SyncMemoryStmt,
+ parser::SyncTeamStmt, parser::UnlockStmt, parser::WaitStmt,
+ parser::WhereStmt, parser::WriteStmt, parser::ComputedGotoStmt,
+ parser::ForallStmt, parser::ArithmeticIfStmt, parser::AssignStmt,
+ parser::AssignedGotoStmt, parser::PauseStmt>;
using OtherStmts = std::tuple<parser::EntryStmt, parser::FormatStmt>;
diff --git a/flang/include/flang/Lower/Runtime.h b/flang/include/flang/Lower/Runtime.h
index e71496edad9b..77e98a1e019e 100644
--- a/flang/include/flang/Lower/Runtime.h
+++ b/flang/include/flang/Lower/Runtime.h
@@ -34,6 +34,7 @@ namespace parser {
struct EventPostStmt;
struct EventWaitStmt;
struct LockStmt;
+struct NotifyWaitStmt;
struct PauseStmt;
struct StopStmt;
struct SyncAllStmt;
@@ -49,6 +50,8 @@ class AbstractConverter;
// Lowering of Fortran statement related runtime (other than IO and maths)
+void genNotifyWaitStatement(AbstractConverter &,
+ const parser::NotifyWaitStmt &);
void genEventPostStatement(AbstractConverter &, const parser::EventPostStmt &);
void genEventWaitStatement(AbstractConverter &, const parser::EventWaitStmt &);
void genLockStatement(AbstractConverter &, const parser::LockStmt &);
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index e7561dffb756..46dc79f41a18 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -59,7 +59,7 @@ public:
bool isVariable() const { return !isValue(); }
bool isMutableBox() const { return hlfir::isBoxAddressType(getType()); }
bool isProcedurePointer() const {
- return hlfir::isBoxProcAddressType(getType());
+ return fir::isBoxProcAddressType(getType());
}
bool isBoxAddressOrValue() const {
return hlfir::isBoxAddressOrValueType(getType());
@@ -404,15 +404,15 @@ mlir::Value inlineElementalOp(
std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
convertToValue(mlir::Location loc, fir::FirOpBuilder &builder,
- const hlfir::Entity &entity);
+ hlfir::Entity entity);
std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
convertToAddress(mlir::Location loc, fir::FirOpBuilder &builder,
- const hlfir::Entity &entity, mlir::Type targetType);
+ hlfir::Entity entity, mlir::Type targetType);
std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
convertToBox(mlir::Location loc, fir::FirOpBuilder &builder,
- const hlfir::Entity &entity, mlir::Type targetType);
+ hlfir::Entity entity, mlir::Type targetType);
/// Clone an hlfir.elemental_addr into an hlfir.elemental value.
hlfir::ElementalOp cloneToElementalOp(mlir::Location loc,
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index ba0c4806c759..dba946975e19 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -202,6 +202,7 @@ struct IntrinsicLibrary {
fir::ExtendedValue genCAssociatedCPtr(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);
void genCFPointer(llvm::ArrayRef<fir::ExtendedValue>);
+ void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
void genDateAndTime(llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h
index a79c67dfe6de..ecfa9839617d 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRType.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRType.h
@@ -436,6 +436,12 @@ inline bool isBoxAddressOrValue(mlir::Type t) {
return fir::unwrapRefType(t).isa<fir::BaseBoxType>();
}
+/// Is this a fir.boxproc address type?
+inline bool isBoxProcAddressType(mlir::Type t) {
+ t = fir::dyn_cast_ptrEleTy(t);
+ return t && t.isa<fir::BoxProcType>();
+}
+
/// Return a string representation of `ty`.
///
/// fir.array<10x10xf32> -> prefix_10x10xf32
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
index e8f284852982..aa68d0811c48 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
@@ -67,12 +67,6 @@ inline bool isBoxAddressType(mlir::Type type) {
return type && type.isa<fir::BaseBoxType>();
}
-/// Is this a fir.boxproc address type?
-inline bool isBoxProcAddressType(mlir::Type type) {
- type = fir::dyn_cast_ptrEleTy(type);
- return type && type.isa<fir::BoxProcType>();
-}
-
/// Is this a fir.box or fir.class address or value type?
inline bool isBoxAddressOrValueType(mlir::Type type) {
return fir::unwrapRefType(type).isa<fir::BaseBoxType>();
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 92bc7246eca7..6970da8698ae 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -10,6 +10,7 @@
#define FORTRAN_OPTIMIZER_TRANSFORMS_PASSES_H
#include "flang/Optimizer/Dialect/FIROps.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"
#include <memory>
@@ -83,6 +84,15 @@ std::unique_ptr<mlir::Pass> createVScaleAttrPass();
std::unique_ptr<mlir::Pass>
createVScaleAttrPass(std::pair<unsigned, unsigned> vscaleAttr);
+struct FunctionAttrTypes {
+ mlir::LLVM::framePointerKind::FramePointerKind framePointerKind =
+ mlir::LLVM::framePointerKind::FramePointerKind::None;
+};
+
+std::unique_ptr<mlir::Pass> createFunctionAttrPass();
+std::unique_ptr<mlir::Pass>
+createFunctionAttrPass(FunctionAttrTypes &functionAttr);
+
// declarative passes
#define GEN_PASS_REGISTRATION
#include "flang/Optimizer/Transforms/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index c3768fd2d689..e3c45d41f04c 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -349,4 +349,25 @@ def VScaleAttr : Pass<"vscale-attr", "mlir::func::FuncOp"> {
let constructor = "::fir::createVScaleAttrPass()";
}
+def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> {
+ let summary = "Pass that adds function attributes expected at LLVM IR level";
+ let description = [{ This pass adds function attributes that customize
+ function characteristics at the LLVM IR level.
+ Options include:
+ "frame-pointer": sets the frame-pointer attribute on functions to avoid
+ saving the frame pointer in a register in functions where it is
+ unnecessary. This eliminates the instructions needed to save, establish,
+ and restore frame pointers, and frees up an additional register in many
+ functions. However, it can make debugging infeasible on some machines.
+ }];
+ let options = [
+ Option<"framePointerKind", "frame-pointer",
+ "mlir::LLVM::framePointerKind::FramePointerKind",
+ /*default=*/"mlir::LLVM::framePointerKind::FramePointerKind{}",
+ "frame pointer">,
+ ];
+ let constructor = "::fir::createFunctionAttrPass()";
+}
+
#endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 7c479a2334ea..1defbf132327 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -301,8 +301,8 @@ public:
NODE(parser, ErrLabel)
NODE(parser, ErrorRecovery)
NODE(parser, EventPostStmt)
+ NODE(parser, EventWaitSpec)
NODE(parser, EventWaitStmt)
- NODE(EventWaitStmt, EventWaitSpec)
NODE(parser, ExecutableConstruct)
NODE(parser, ExecutionPart)
NODE(parser, ExecutionPartConstruct)
@@ -462,6 +462,7 @@ public:
NODE(NamelistStmt, Group)
NODE(parser, NonLabelDoStmt)
NODE(parser, NoPass)
+ NODE(parser, NotifyWaitStmt)
NODE(parser, NullifyStmt)
NODE(parser, NullInit)
NODE(parser, ObjectDecl)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 393e0e24ec5c..71195f2bb9dd 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -209,11 +209,13 @@ struct ExitStmt; // R1156
struct GotoStmt; // R1157
struct ComputedGotoStmt; // R1158
struct StopStmt; // R1160, R1161
+struct NotifyWaitStmt; // F2023: R1166
struct SyncAllStmt; // R1164
struct SyncImagesStmt; // R1166
struct SyncMemoryStmt; // R1168
struct SyncTeamStmt; // R1169
struct EventPostStmt; // R1170, R1171
+struct EventWaitSpec; // F2023: R1177
struct EventWaitStmt; // R1172, R1173, R1174
struct FormTeamStmt; // R1175, R1176, R1177
struct LockStmt; // R1178
@@ -477,9 +479,9 @@ EMPTY_CLASS(FailImageStmt);
// close-stmt | continue-stmt | cycle-stmt | deallocate-stmt |
// endfile-stmt | error-stop-stmt | event-post-stmt | event-wait-stmt |
// exit-stmt | fail-image-stmt | flush-stmt | form-team-stmt |
-// goto-stmt | if-stmt | inquire-stmt | lock-stmt | nullify-stmt |
-// open-stmt | pointer-assignment-stmt | print-stmt | read-stmt |
-// return-stmt | rewind-stmt | stop-stmt | sync-all-stmt |
+// goto-stmt | if-stmt | inquire-stmt | lock-stmt | notify-wait-stmt |
+// nullify-stmt | open-stmt | pointer-assignment-stmt | print-stmt |
+// read-stmt | return-stmt | rewind-stmt | stop-stmt | sync-all-stmt |
// sync-images-stmt | sync-memory-stmt | sync-team-stmt | unlock-stmt |
// wait-stmt | where-stmt | write-stmt | computed-goto-stmt | forall-stmt
struct ActionStmt {
@@ -494,8 +496,8 @@ struct ActionStmt {
common::Indirection<FlushStmt>, common::Indirection<FormTeamStmt>,
common::Indirection<GotoStmt>, common::Indirection<IfStmt>,
common::Indirection<InquireStmt>, common::Indirection<LockStmt>,
- common::Indirection<NullifyStmt>, common::Indirection<OpenStmt>,
- common::Indirection<PointerAssignmentStmt>,
+ common::Indirection<NotifyWaitStmt>, common::Indirection<NullifyStmt>,
+ common::Indirection<OpenStmt>, common::Indirection<PointerAssignmentStmt>,
common::Indirection<PrintStmt>, common::Indirection<ReadStmt>,
common::Indirection<ReturnStmt>, common::Indirection<RewindStmt>,
common::Indirection<StopStmt>, common::Indirection<SyncAllStmt>,
@@ -2492,6 +2494,13 @@ struct StopStmt {
std::tuple<Kind, std::optional<StopCode>, std::optional<ScalarLogicalExpr>> t;
};
+// F2023: R1166 notify-wait-stmt -> NOTIFY WAIT ( notify-variable [,
+// event-wait-spec-list] )
+struct NotifyWaitStmt {
+ TUPLE_CLASS_BOILERPLATE(NotifyWaitStmt);
+ std::tuple<Scalar<Variable>, std::list<EventWaitSpec>> t;
+};
+
// R1164 sync-all-stmt -> SYNC ALL [( [sync-stat-list] )]
WRAPPER_CLASS(SyncAllStmt, std::list<StatOrErrmsg>);
@@ -2524,15 +2533,16 @@ struct EventPostStmt {
std::tuple<EventVariable, std::list<StatOrErrmsg>> t;
};
+// R1173 event-wait-spec -> until-spec | sync-stat
+struct EventWaitSpec {
+ UNION_CLASS_BOILERPLATE(EventWaitSpec);
+ std::variant<ScalarIntExpr, StatOrErrmsg> u;
+};
+
// R1172 event-wait-stmt ->
// EVENT WAIT ( event-variable [, event-wait-spec-list] )
-// R1173 event-wait-spec -> until-spec | sync-stat
// R1174 until-spec -> UNTIL_COUNT = scalar-int-expr
struct EventWaitStmt {
- struct EventWaitSpec {
- UNION_CLASS_BOILERPLATE(EventWaitSpec);
- std::variant<ScalarIntExpr, StatOrErrmsg> u;
- };
TUPLE_CLASS_BOILERPLATE(EventWaitStmt);
std::tuple<EventVariable, std::list<EventWaitSpec>> t;
};
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index 4169483398f6..58061d986209 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -26,22 +26,22 @@ extern "C" {
// A descriptor must be initialized before being used for any purpose,
// but needs reinitialization in a deallocated state only when there is
// a change of type, rank, or corank.
-void RTNAME(AllocatableInitIntrinsic)(
+void RTDECL(AllocatableInitIntrinsic)(
Descriptor &, TypeCategory, int kind, int rank = 0, int corank = 0);
-void RTNAME(AllocatableInitCharacter)(Descriptor &, SubscriptValue length = 0,
+void RTDECL(AllocatableInitCharacter)(Descriptor &, SubscriptValue length = 0,
int kind = 1, int rank = 0, int corank = 0);
-void RTNAME(AllocatableInitDerived)(
+void RTDECL(AllocatableInitDerived)(
Descriptor &, const typeInfo::DerivedType &, int rank = 0, int corank = 0);
// Initializes the descriptor for an allocatable of intrinsic or derived type.
// These functions are meant to be used in the allocate statement lowering. If
// the descriptor is allocated, the initialization is skipped so the error
// handling can be done by AllocatableAllocate.
-void RTNAME(AllocatableInitIntrinsicForAllocate)(
+void RTDECL(AllocatableInitIntrinsicForAllocate)(
Descriptor &, TypeCategory, int kind, int rank = 0, int corank = 0);
-void RTNAME(AllocatableInitCharacterForAllocate)(Descriptor &,
+void RTDECL(AllocatableInitCharacterForAllocate)(Descriptor &,
SubscriptValue length = 0, int kind = 1, int rank = 0, int corank = 0);
-void RTNAME(AllocatableInitDerivedForAllocate)(
+void RTDECL(AllocatableInitDerivedForAllocate)(
Descriptor &, const typeInfo::DerivedType &, int rank = 0, int corank = 0);
// Checks that an allocatable is not already allocated in statements
@@ -50,29 +50,29 @@ void RTNAME(AllocatableInitDerivedForAllocate)(
// (If there's no STAT=, the error will be caught later anyway, but
// this API allows the error to be caught before descriptor is modified.)
// Return 0 on success (deallocated state), else the STAT= value.
-int RTNAME(AllocatableCheckAllocated)(Descriptor &,
+int RTDECL(AllocatableCheckAllocated)(Descriptor &,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
// For MOLD= allocation; sets bounds, cobounds, and length type
// parameters from another descriptor. The destination descriptor must
// be initialized and deallocated.
-void RTNAME(AllocatableApplyMold)(
+void RTDECL(AllocatableApplyMold)(
Descriptor &, const Descriptor &mold, int rank = 0);
// Explicitly sets the bounds and length type parameters of an initialized
// deallocated allocatable.
-void RTNAME(AllocatableSetBounds)(
+void RTDECL(AllocatableSetBounds)(
Descriptor &, int zeroBasedDim, SubscriptValue lower, SubscriptValue upper);
// The upper cobound is ignored for the last codimension.
-void RTNAME(AllocatableSetCoBounds)(Descriptor &, int zeroBasedCoDim,
+void RTDECL(AllocatableSetCoBounds)(Descriptor &, int zeroBasedCoDim,
SubscriptValue lower, SubscriptValue upper = 0);
// Length type parameters are indexed in declaration order; i.e., 0 is the
// first length type parameter in the deepest base type. (Not for use
// with CHARACTER; see above.)
-void RTNAME(AllocatableSetDerivedLength)(
+void RTDECL(AllocatableSetDerivedLength)(
Descriptor &, int which, SubscriptValue);
// When an explicit type-spec appears in an ALLOCATE statement for an
@@ -80,7 +80,7 @@ void RTNAME(AllocatableSetDerivedLength)(
// a derived type or CHARACTER value, the explicit value has to match
// the length type parameter's value. This API checks that requirement.
// Returns 0 for success, or the STAT= value on failure with hasStat==true.
-int RTNAME(AllocatableCheckLengthParameter)(Descriptor &,
+int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
int which /* 0 for CHARACTER length */, SubscriptValue other,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
@@ -94,10 +94,10 @@ int RTNAME(AllocatableCheckLengthParameter)(Descriptor &,
// Successfully allocated memory is initialized if the allocatable has a
// derived type, and is always initialized by AllocatableAllocateSource().
// Performs all necessary coarray synchronization and validation actions.
-int RTNAME(AllocatableAllocate)(Descriptor &, bool hasStat = false,
+int RTDECL(AllocatableAllocate)(Descriptor &, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
-int RTNAME(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
+int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
@@ -105,7 +105,7 @@ int RTNAME(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
// but note the order of first two arguments is reversed for consistency
// with the other APIs for allocatables.) The destination descriptor
// must be initialized.
-std::int32_t RTNAME(MoveAlloc)(Descriptor &to, Descriptor &from,
+std::int32_t RTDECL(MoveAlloc)(Descriptor &to, Descriptor &from,
const typeInfo::DerivedType *, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
@@ -113,19 +113,19 @@ std::int32_t RTNAME(MoveAlloc)(Descriptor &to, Descriptor &from,
// Deallocates an allocatable. Finalizes elements &/or components as needed.
// The allocatable is left in an initialized state suitable for reallocation
// with the same bounds, cobounds, and length type parameters.
-int RTNAME(AllocatableDeallocate)(Descriptor &, bool hasStat = false,
+int RTDECL(AllocatableDeallocate)(Descriptor &, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
// Same as AllocatableDeallocate but also set the dynamic type as the declared
// type as mentioned in 7.3.2.3 note 7.
-int RTNAME(AllocatableDeallocatePolymorphic)(Descriptor &,
+int RTDECL(AllocatableDeallocatePolymorphic)(Descriptor &,
const typeInfo::DerivedType *, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
// Variant of above that does not finalize; for intermediate results
-void RTNAME(AllocatableDeallocateNoFinal)(
+void RTDECL(AllocatableDeallocateNoFinal)(
Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/include/flang/Runtime/api-attrs.h b/flang/include/flang/Runtime/api-attrs.h
index 61da2c06d3a4..9c8a67ffc34a 100644
--- a/flang/include/flang/Runtime/api-attrs.h
+++ b/flang/include/flang/Runtime/api-attrs.h
@@ -121,4 +121,15 @@
#undef RT_DEVICE_COMPILATION
#endif
+#if defined(__CUDACC__)
+#define RT_DIAG_PUSH _Pragma("nv_diagnostic push")
+#define RT_DIAG_POP _Pragma("nv_diagnostic pop")
+#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN \
+ _Pragma("nv_diag_suppress 20011") _Pragma("nv_diag_suppress 20014")
+#else /* !defined(__CUDACC__) */
+#define RT_DIAG_PUSH
+#define RT_DIAG_POP
+#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+#endif /* !defined(__CUDACC__) */
+
#endif /* !FORTRAN_RUNTIME_API_ATTRS_H_ */
diff --git a/flang/include/flang/Runtime/array-constructor.h b/flang/include/flang/Runtime/array-constructor.h
index 5274a2fc9e08..46fc0418c799 100644
--- a/flang/include/flang/Runtime/array-constructor.h
+++ b/flang/include/flang/Runtime/array-constructor.h
@@ -21,15 +21,17 @@ namespace Fortran::runtime {
// Runtime data structure to hold information about the storage of
// an array constructor being constructed.
struct ArrayConstructorVector {
- ArrayConstructorVector(class Descriptor &to, SubscriptValue nextValuePosition,
- SubscriptValue actualAllocationSize, const char *sourceFile,
- int sourceLine, bool useValueLengthParameters)
+ RT_API_ATTRS ArrayConstructorVector(class Descriptor &to,
+ SubscriptValue nextValuePosition, SubscriptValue actualAllocationSize,
+ const char *sourceFile, int sourceLine, bool useValueLengthParameters)
: to{to}, nextValuePosition{nextValuePosition},
actualAllocationSize{actualAllocationSize}, sourceFile{sourceFile},
- sourceLine{sourceLine}, useValueLengthParameters_{
- useValueLengthParameters} {}
+ sourceLine{sourceLine},
+ useValueLengthParameters_{useValueLengthParameters} {}
- bool useValueLengthParameters() const { return useValueLengthParameters_; }
+ RT_API_ATTRS bool useValueLengthParameters() const {
+ return useValueLengthParameters_;
+ }
class Descriptor &to;
SubscriptValue nextValuePosition;
@@ -95,13 +97,13 @@ extern "C" {
// the target the runtime is compiled for). This avoids the need for the runtime
// to maintain a state, or to use dynamic allocation for it. "vectorClassSize"
// is used to validate that lowering allocated enough space for it.
-void RTNAME(InitArrayConstructorVector)(ArrayConstructorVector &vector,
+void RTDECL(InitArrayConstructorVector)(ArrayConstructorVector &vector,
Descriptor &to, bool useValueLengthParameters, int vectorClassSize,
const char *sourceFile = nullptr, int sourceLine = 0);
// Generic API to push any kind of entity into the array constructor (any
// Fortran type and any rank).
-void RTNAME(PushArrayConstructorValue)(
+void RTDECL(PushArrayConstructorValue)(
ArrayConstructorVector &vector, const Descriptor &from);
// API to push scalar array constructor value of:
@@ -109,7 +111,7 @@ void RTNAME(PushArrayConstructorValue)(
// - or a derived type that has no length parameters, and no allocatable
// component (that would require deep copies).
// It requires no descriptor for the value that is passed via its base address.
-void RTNAME(PushArrayConstructorSimpleScalar)(
+void RTDECL(PushArrayConstructorSimpleScalar)(
ArrayConstructorVector &vector, void *from);
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/include/flang/Runtime/character.h b/flang/include/flang/Runtime/character.h
index 768de75b639c..dd47686fe858 100644
--- a/flang/include/flang/Runtime/character.h
+++ b/flang/include/flang/Runtime/character.h
@@ -20,14 +20,16 @@ namespace Fortran::runtime {
class Descriptor;
template <typename CHAR>
-int CharacterScalarCompare(
+RT_API_ATTRS int CharacterScalarCompare(
const CHAR *x, const CHAR *y, std::size_t xChars, std::size_t yChars);
-extern template int CharacterScalarCompare<char>(
+extern template RT_API_ATTRS int CharacterScalarCompare<char>(
const char *x, const char *y, std::size_t xChars, std::size_t yChars);
-extern template int CharacterScalarCompare<char16_t>(const char16_t *x,
- const char16_t *y, std::size_t xChars, std::size_t yChars);
-extern template int CharacterScalarCompare<char32_t>(const char32_t *x,
- const char32_t *y, std::size_t xChars, std::size_t yChars);
+extern template RT_API_ATTRS int CharacterScalarCompare<char16_t>(
+ const char16_t *x, const char16_t *y, std::size_t xChars,
+ std::size_t yChars);
+extern template RT_API_ATTRS int CharacterScalarCompare<char32_t>(
+ const char32_t *x, const char32_t *y, std::size_t xChars,
+ std::size_t yChars);
extern "C" {
@@ -36,12 +38,12 @@ extern "C" {
// initialized CHARACTER allocatable scalar or array descriptor -- use
// AllocatableInitCharacter() to set one up. Crashes when not
// conforming. Assumes independence of data.
-void RTNAME(CharacterConcatenate)(Descriptor &accumulator,
+void RTDECL(CharacterConcatenate)(Descriptor &accumulator,
const Descriptor &from, const char *sourceFile = nullptr,
int sourceLine = 0);
// Convenience specialization for ASCII scalars concatenation.
-void RTNAME(CharacterConcatenateScalar1)(
+void RTDECL(CharacterConcatenateScalar1)(
Descriptor &accumulator, const char *from, std::size_t chars);
// CHARACTER comparisons. The kinds must match. Like std::memcmp(),
@@ -52,77 +54,77 @@ void RTNAME(CharacterConcatenateScalar1)(
// N.B.: Calls to the restricted specific intrinsic functions LGE, LGT, LLE,
// & LLT are converted into calls to these during lowering; they don't have
// to be able to be passed as actual procedure arguments.
-int RTNAME(CharacterCompareScalar)(const Descriptor &, const Descriptor &);
-int RTNAME(CharacterCompareScalar1)(
+int RTDECL(CharacterCompareScalar)(const Descriptor &, const Descriptor &);
+int RTDECL(CharacterCompareScalar1)(
const char *x, const char *y, std::size_t xChars, std::size_t yChars);
-int RTNAME(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
+int RTDECL(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
std::size_t xChars, std::size_t yChars);
-int RTNAME(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
+int RTDECL(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
std::size_t xChars, std::size_t yChars);
// General CHARACTER comparison; the result is a LOGICAL(KIND=1) array that
// is established and populated.
-void RTNAME(CharacterCompare)(
+void RTDECL(CharacterCompare)(
Descriptor &result, const Descriptor &, const Descriptor &);
// Special-case support for optimized ASCII scalar expressions.
// Copies data from 'rhs' to the remaining space (lhsLength - offset)
// in 'lhs', if any. Returns the new offset. Assumes independence.
-std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
+std::size_t RTDECL(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
std::size_t offset, const char *rhs, std::size_t rhsBytes);
// Appends any necessary spaces to a CHARACTER(KIND=1) scalar.
-void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset);
+void RTDECL(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset);
// Intrinsic functions
// The result descriptors below are all established by the runtime.
-void RTNAME(Adjustl)(Descriptor &result, const Descriptor &,
+void RTDECL(Adjustl)(Descriptor &result, const Descriptor &,
const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(Adjustr)(Descriptor &result, const Descriptor &,
+void RTDECL(Adjustr)(Descriptor &result, const Descriptor &,
const char *sourceFile = nullptr, int sourceLine = 0);
-std::size_t RTNAME(LenTrim1)(const char *, std::size_t);
-std::size_t RTNAME(LenTrim2)(const char16_t *, std::size_t);
-std::size_t RTNAME(LenTrim4)(const char32_t *, std::size_t);
-void RTNAME(LenTrim)(Descriptor &result, const Descriptor &, int kind,
+std::size_t RTDECL(LenTrim1)(const char *, std::size_t);
+std::size_t RTDECL(LenTrim2)(const char16_t *, std::size_t);
+std::size_t RTDECL(LenTrim4)(const char32_t *, std::size_t);
+void RTDECL(LenTrim)(Descriptor &result, const Descriptor &, int kind,
const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(Repeat)(Descriptor &result, const Descriptor &string,
+void RTDECL(Repeat)(Descriptor &result, const Descriptor &string,
std::int64_t ncopies, const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(Trim)(Descriptor &result, const Descriptor &string,
+void RTDECL(Trim)(Descriptor &result, const Descriptor &string,
const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
+void RTDECL(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
const char *sourceFile = nullptr, int sourceLine = 0);
-void RTNAME(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
+void RTDECL(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
const char *sourceFile = nullptr, int sourceLine = 0);
-std::size_t RTNAME(Index1)(const char *, std::size_t, const char *substring,
+std::size_t RTDECL(Index1)(const char *, std::size_t, const char *substring,
std::size_t, bool back = false);
-std::size_t RTNAME(Index2)(const char16_t *, std::size_t,
+std::size_t RTDECL(Index2)(const char16_t *, std::size_t,
const char16_t *substring, std::size_t, bool back = false);
-std::size_t RTNAME(Index4)(const char32_t *, std::size_t,
+std::size_t RTDECL(Index4)(const char32_t *, std::size_t,
const char32_t *substring, std::size_t, bool back = false);
-void RTNAME(Index)(Descriptor &result, const Descriptor &string,
+void RTDECL(Index)(Descriptor &result, const Descriptor &string,
const Descriptor &substring, const Descriptor *back /*can be null*/,
int kind, const char *sourceFile = nullptr, int sourceLine = 0);
-std::size_t RTNAME(Scan1)(
+std::size_t RTDECL(Scan1)(
const char *, std::size_t, const char *set, std::size_t, bool back = false);
-std::size_t RTNAME(Scan2)(const char16_t *, std::size_t, const char16_t *set,
+std::size_t RTDECL(Scan2)(const char16_t *, std::size_t, const char16_t *set,
std::size_t, bool back = false);
-std::size_t RTNAME(Scan4)(const char32_t *, std::size_t, const char32_t *set,
+std::size_t RTDECL(Scan4)(const char32_t *, std::size_t, const char32_t *set,
std::size_t, bool back = false);
-void RTNAME(Scan)(Descriptor &result, const Descriptor &string,
+void RTDECL(Scan)(Descriptor &result, const Descriptor &string,
const Descriptor &set, const Descriptor *back /*can be null*/, int kind,
const char *sourceFile = nullptr, int sourceLine = 0);
-std::size_t RTNAME(Verify1)(
+std::size_t RTDECL(Verify1)(
const char *, std::size_t, const char *set, std::size_t, bool back = false);
-std::size_t RTNAME(Verify2)(const char16_t *, std::size_t, const char16_t *set,
+std::size_t RTDECL(Verify2)(const char16_t *, std::size_t, const char16_t *set,
std::size_t, bool back = false);
-std::size_t RTNAME(Verify4)(const char32_t *, std::size_t, const char32_t *set,
+std::size_t RTDECL(Verify4)(const char32_t *, std::size_t, const char32_t *set,
std::size_t, bool back = false);
-void RTNAME(Verify)(Descriptor &result, const Descriptor &string,
+void RTDECL(Verify)(Descriptor &result, const Descriptor &string,
const Descriptor &set, const Descriptor *back /*can be null*/, int kind,
const char *sourceFile = nullptr, int sourceLine = 0);
}
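
A minimal caller-side sketch (not part of this patch) of the optimized ASCII scalar
path described in the comments above; the helper name ConcatHelloWorld and the
literal lengths are illustrative only:

    #include "flang/Runtime/character.h"
    using namespace Fortran::runtime;

    // Appends two literals into a fixed-length CHARACTER(KIND=1) buffer and
    // blank-fills whatever space remains -- roughly the pattern lowering can
    // emit for scalar ASCII concatenation.
    void ConcatHelloWorld(char *lhs, std::size_t lhsBytes) {
      std::size_t offset{0};
      offset = RTNAME(CharacterAppend1)(lhs, lhsBytes, offset, "Hello, ", 7);
      offset = RTNAME(CharacterAppend1)(lhs, lhsBytes, offset, "world", 5);
      RTNAME(CharacterPad1)(lhs, lhsBytes, offset); // pad the remainder with blanks
    }
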
diff --git a/flang/include/flang/Runtime/derived-api.h b/flang/include/flang/Runtime/derived-api.h
index decba9f686d9..79aa7d82de88 100644
--- a/flang/include/flang/Runtime/derived-api.h
+++ b/flang/include/flang/Runtime/derived-api.h
@@ -29,37 +29,37 @@ extern "C" {
// Initializes and allocates an object's components, if it has a derived type
// with any default component initialization or automatic components.
// The descriptor must be initialized and non-null.
-void RTNAME(Initialize)(
+void RTDECL(Initialize)(
const Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
// Finalizes an object and its components. Deallocates any
// allocatable/automatic components. Does not deallocate the descriptor's
// storage.
-void RTNAME(Destroy)(const Descriptor &);
+void RTDECL(Destroy)(const Descriptor &);
// Finalizes the object and its components.
-void RTNAME(Finalize)(
+void RTDECL(Finalize)(
const Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
/// Deallocates any allocatable/automatic components.
/// Does not deallocate the descriptor's storage.
/// Does not perform any finalization.
-void RTNAME(DestroyWithoutFinalization)(const Descriptor &);
+void RTDECL(DestroyWithoutFinalization)(const Descriptor &);
// Intrinsic or defined assignment, with scalar expansion but not type
// conversion.
-void RTNAME(Assign)(const Descriptor &, const Descriptor &,
+void RTDECL(Assign)(const Descriptor &, const Descriptor &,
const char *sourceFile = nullptr, int sourceLine = 0);
// Perform the test of the CLASS IS type guard statement of the SELECT TYPE
// construct.
-bool RTNAME(ClassIs)(const Descriptor &, const typeInfo::DerivedType &);
+bool RTDECL(ClassIs)(const Descriptor &, const typeInfo::DerivedType &);
// Perform the test of the SAME_TYPE_AS intrinsic.
-bool RTNAME(SameTypeAs)(const Descriptor &, const Descriptor &);
+bool RTDECL(SameTypeAs)(const Descriptor &, const Descriptor &);
// Perform the test of the EXTENDS_TYPE_OF intrinsic.
-bool RTNAME(ExtendsTypeOf)(const Descriptor &, const Descriptor &);
+bool RTDECL(ExtendsTypeOf)(const Descriptor &, const Descriptor &);
} // extern "C"
} // namespace Fortran::runtime
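
For context, a hedged sketch (not from this patch) of how compiled code might drive
the lifetime entry points declared above; 'desc' is assumed to be an
already-established descriptor for an object of a derived type with default
component initialization:

    #include "flang/Runtime/derived-api.h"
    using namespace Fortran::runtime;

    void ConstructUseDestroy(const Descriptor &desc) {
      // Default-initialize components (and allocate automatic ones).
      RTNAME(Initialize)(desc, __FILE__, __LINE__);
      // ... the object is used here ...
      // Finalize and deallocate allocatable/automatic components; the
      // descriptor's own storage is untouched.
      RTNAME(Destroy)(desc);
    }
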
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index fa68d9776969..e36b37c1a917 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -67,16 +67,16 @@ public:
}
// Do not use this API to cause the LB of an empty dimension
// to be anything other than 1. Use SetBounds() instead if you can.
- Dimension &SetLowerBound(SubscriptValue lower) {
+ RT_API_ATTRS Dimension &SetLowerBound(SubscriptValue lower) {
raw_.lower_bound = lower;
return *this;
}
- Dimension &SetUpperBound(SubscriptValue upper) {
+ RT_API_ATTRS Dimension &SetUpperBound(SubscriptValue upper) {
auto lower{raw_.lower_bound};
raw_.extent = upper >= lower ? upper - lower + 1 : 0;
return *this;
}
- Dimension &SetExtent(SubscriptValue extent) {
+ RT_API_ATTRS Dimension &SetExtent(SubscriptValue extent) {
raw_.extent = extent;
return *this;
}
@@ -467,5 +467,6 @@ public:
private:
char storage_[byteSize]{};
};
+
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_DESCRIPTOR_H_
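
A small usage sketch (assuming the Descriptor::GetDimension() and
Dimension::SetBounds() accessors this header also provides), showing the preferred
way to establish 1:n bounds so that an empty extent still ends up with a lower
bound of 1:

    #include "flang/Runtime/descriptor.h"
    using namespace Fortran::runtime;

    // Illustrative helper: sets bounds 1:n on the first dimension of an
    // already-created descriptor; for a non-empty dimension this is the same
    // as chaining SetLowerBound(1).SetUpperBound(n).
    void SetFirstDimBounds(Descriptor &d, SubscriptValue n) {
      d.GetDimension(0).SetBounds(1, n);
    }
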
diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h
index ad592814e5ac..175113c57ccb 100644
--- a/flang/include/flang/Runtime/extensions.h
+++ b/flang/include/flang/Runtime/extensions.h
@@ -14,6 +14,7 @@
#define FORTRAN_PROCEDURE_NAME(name) name##_
+#include <cstddef>
#include <cstdint>
extern "C" {
@@ -28,5 +29,8 @@ std::int32_t FORTRAN_PROCEDURE_NAME(iargc)();
void FORTRAN_PROCEDURE_NAME(getarg)(
std::int32_t &n, std::int8_t *arg, std::int64_t length);
+// GNU extension subroutine GETLOG(C).
+void FORTRAN_PROCEDURE_NAME(getlog)(std::byte *name, std::int64_t length);
+
} // extern "C"
#endif // FORTRAN_RUNTIME_EXTENSIONS_H_
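
A hedged C++-side sketch of calling the new GETLOG binding (the wrapper name
GetLoginName and its buffer handling are illustrative, not part of the patch):

    #include "flang/Runtime/extensions.h"
    #include <cstddef>
    #include <cstdint>

    // Passes a CHARACTER(len) buffer to the runtime's getlog_ entry point.
    void GetLoginName(char *buf, std::int64_t len) {
      FORTRAN_PROCEDURE_NAME(getlog)(reinterpret_cast<std::byte *>(buf), len);
    }
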
diff --git a/flang/include/flang/Runtime/inquiry.h b/flang/include/flang/Runtime/inquiry.h
index 8d673637b300..3fe670b0fae3 100644
--- a/flang/include/flang/Runtime/inquiry.h
+++ b/flang/include/flang/Runtime/inquiry.h
@@ -21,13 +21,13 @@ class Descriptor;
extern "C" {
-std::int64_t RTNAME(LboundDim)(const Descriptor &array, int dim,
+std::int64_t RTDECL(LboundDim)(const Descriptor &array, int dim,
const char *sourceFile = nullptr, int line = 0);
-void RTNAME(Ubound)(Descriptor &result, const Descriptor &array, int kind,
+void RTDECL(Ubound)(Descriptor &result, const Descriptor &array, int kind,
const char *sourceFile = nullptr, int line = 0);
-std::int64_t RTNAME(Size)(
+std::int64_t RTDECL(Size)(
const Descriptor &array, const char *sourceFile = nullptr, int line = 0);
-std::int64_t RTNAME(SizeDim)(const Descriptor &array, int dim,
+std::int64_t RTDECL(SizeDim)(const Descriptor &array, int dim,
const char *sourceFile = nullptr, int line = 0);
} // extern "C"
diff --git a/flang/include/flang/Runtime/io-api.h b/flang/include/flang/Runtime/io-api.h
index 41574e3bb80a..0277f0ea9e97 100644
--- a/flang/include/flang/Runtime/io-api.h
+++ b/flang/include/flang/Runtime/io-api.h
@@ -14,6 +14,7 @@
#include "flang/Common/uint128.h"
#include "flang/Runtime/entry-names.h"
#include "flang/Runtime/iostat.h"
+#include "flang/Runtime/magic-numbers.h"
#include <cinttypes>
#include <cstddef>
@@ -29,7 +30,9 @@ class IoStatementState;
using Cookie = IoStatementState *;
using ExternalUnit = int;
using AsynchronousId = int;
-static constexpr ExternalUnit DefaultUnit{-1}; // READ(*), WRITE(*), PRINT
+
+static constexpr ExternalUnit DefaultOutputUnit{FORTRAN_DEFAULT_OUTPUT_UNIT};
+static constexpr ExternalUnit DefaultInputUnit{FORTRAN_DEFAULT_INPUT_UNIT};
// INQUIRE specifiers are encoded as simple base-26 packings of
// the spellings of their keywords.
@@ -57,7 +60,8 @@ extern "C" {
// These functions initiate data transfer statements (READ, WRITE, PRINT).
// Example: PRINT *, 666 is implemented as the series of calls:
-// Cookie cookie{BeginExternalListOutput(DefaultUnit,__FILE__,__LINE__)};
+// Cookie cookie{BeginExternalListOutput(DefaultOutputUnit,
+// __FILE__, __LINE__)};
// OutputInteger32(cookie, 666);
// EndIoStatement(cookie);
// Formatted I/O with explicit formats can supply the format as a
@@ -135,19 +139,21 @@ enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit,
const char *sourceFile = nullptr, int sourceLine = 0);
// External synchronous I/O initiation
-Cookie IONAME(BeginExternalListOutput)(ExternalUnit = DefaultUnit,
+Cookie IONAME(BeginExternalListOutput)(ExternalUnit = DefaultOutputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IONAME(BeginExternalListInput)(ExternalUnit = DefaultUnit,
+Cookie IONAME(BeginExternalListInput)(ExternalUnit = DefaultInputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
Cookie IONAME(BeginExternalFormattedOutput)(const char *format, std::size_t,
- const Descriptor *formatDescriptor = nullptr, ExternalUnit = DefaultUnit,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const Descriptor *formatDescriptor = nullptr,
+ ExternalUnit = DefaultOutputUnit, const char *sourceFile = nullptr,
+ int sourceLine = 0);
Cookie IONAME(BeginExternalFormattedInput)(const char *format, std::size_t,
- const Descriptor *formatDescriptor = nullptr, ExternalUnit = DefaultUnit,
- const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IONAME(BeginUnformattedOutput)(ExternalUnit = DefaultUnit,
+ const Descriptor *formatDescriptor = nullptr,
+ ExternalUnit = DefaultInputUnit, const char *sourceFile = nullptr,
+ int sourceLine = 0);
+Cookie IONAME(BeginUnformattedOutput)(ExternalUnit = DefaultOutputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
-Cookie IONAME(BeginUnformattedInput)(ExternalUnit = DefaultUnit,
+Cookie IONAME(BeginUnformattedInput)(ExternalUnit = DefaultInputUnit,
const char *sourceFile = nullptr, int sourceLine = 0);
// WAIT(ID=)
@@ -190,7 +196,7 @@ Cookie IONAME(BeginInquireIoLength)(
// This call makes the runtime library defer those particular error/end
// conditions to the EndIoStatement() call rather than terminating
// the image. E.g., for READ(*,*,END=666) A, B, (C(J),J=1,N)
-// Cookie cookie{BeginExternalListInput(DefaultUnit,__FILE__,__LINE__)};
+// Cookie cookie{BeginExternalListInput(DefaultInputUnit,__FILE__,__LINE__)};
// EnableHandlers(cookie, false, false, true /*END=*/, false);
// if (InputReal64(cookie, &A)) {
// if (InputReal64(cookie, &B)) {
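
The PRINT example from the comment above, written out against the renamed default
units (a sketch of what lowering emits, not code from this patch):

    #include "flang/Runtime/io-api.h"
    using namespace Fortran::runtime::io;

    // PRINT *, 666
    void PrintExample() {
      Cookie cookie{IONAME(BeginExternalListOutput)(
          DefaultOutputUnit, __FILE__, __LINE__)};
      IONAME(OutputInteger32)(cookie, 666);
      IONAME(EndIoStatement)(cookie);
    }
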
diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h
index d00d5027d4ed..196b13ad3755 100644
--- a/flang/include/flang/Runtime/magic-numbers.h
+++ b/flang/include/flang/Runtime/magic-numbers.h
@@ -27,6 +27,10 @@ start at 100 so as to never conflict with those codes.
#ifndef FORTRAN_RUNTIME_MAGIC_NUMBERS_H_
#define FORTRAN_RUNTIME_MAGIC_NUMBERS_H_
+#define FORTRAN_DEFAULT_OUTPUT_UNIT 6
+#define FORTRAN_DEFAULT_INPUT_UNIT 5
+#define FORTRAN_ERROR_UNIT 0
+
#define FORTRAN_RUNTIME_IOSTAT_END (-1)
#define FORTRAN_RUNTIME_IOSTAT_EOR (-2)
#define FORTRAN_RUNTIME_IOSTAT_FLUSH (-3)
diff --git a/flang/include/flang/Runtime/matmul-transpose.h b/flang/include/flang/Runtime/matmul-transpose.h
index 7cfb189863df..5eb5896972e0 100644
--- a/flang/include/flang/Runtime/matmul-transpose.h
+++ b/flang/include/flang/Runtime/matmul-transpose.h
@@ -18,12 +18,12 @@ extern "C" {
// The most general MATMUL(TRANSPOSE()). All type and shape information is
// taken from the arguments' descriptors, and the result is dynamically
// allocated.
-void RTNAME(MatmulTranspose)(Descriptor &, const Descriptor &,
+void RTDECL(MatmulTranspose)(Descriptor &, const Descriptor &,
const Descriptor &, const char *sourceFile = nullptr, int line = 0);
// A non-allocating variant; the result's descriptor must be established
// and have a valid base address.
-void RTNAME(MatmulTransposeDirect)(const Descriptor &, const Descriptor &,
+void RTDECL(MatmulTransposeDirect)(const Descriptor &, const Descriptor &,
const Descriptor &, const char *sourceFile = nullptr, int line = 0);
} // extern "C"
} // namespace Fortran::runtime
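
A brief caller-side sketch (illustrative only) of the allocating entry point, which
establishes and allocates 'result' itself from the operands' descriptors:

    #include "flang/Runtime/matmul-transpose.h"
    using namespace Fortran::runtime;

    // Computes MATMUL(TRANSPOSE(x), y) into a result descriptor that starts
    // out unallocated; the runtime fills in its type, shape, and storage.
    void FusedTransposeMatmul(Descriptor &result, const Descriptor &x,
        const Descriptor &y) {
      RTNAME(MatmulTranspose)(result, x, y, __FILE__, __LINE__);
    }
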
diff --git a/flang/include/flang/Runtime/matmul.h b/flang/include/flang/Runtime/matmul.h
index 4598c487a12c..40581d44de9e 100644
--- a/flang/include/flang/Runtime/matmul.h
+++ b/flang/include/flang/Runtime/matmul.h
@@ -17,12 +17,12 @@ extern "C" {
// The most general MATMUL. All type and shape information is taken from the
// arguments' descriptors, and the result is dynamically allocated.
-void RTNAME(Matmul)(Descriptor &, const Descriptor &, const Descriptor &,
+void RTDECL(Matmul)(Descriptor &, const Descriptor &, const Descriptor &,
const char *sourceFile = nullptr, int line = 0);
// A non-allocating variant; the result's descriptor must be established
// and have a valid base address.
-void RTNAME(MatmulDirect)(const Descriptor &, const Descriptor &,
+void RTDECL(MatmulDirect)(const Descriptor &, const Descriptor &,
const Descriptor &, const char *sourceFile = nullptr, int line = 0);
} // extern "C"
} // namespace Fortran::runtime
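
By contrast with the allocating form, a sketch of the Direct variant (the helper
name is illustrative): the caller must have established 'result' with a valid base
address and a conforming shape before the call.

    #include "flang/Runtime/matmul.h"
    using namespace Fortran::runtime;

    // MATMUL into preallocated storage; the runtime performs no allocation.
    void MatmulIntoPreallocated(const Descriptor &result, const Descriptor &x,
        const Descriptor &y) {
      RTNAME(MatmulDirect)(result, x, y, __FILE__, __LINE__);
    }
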
diff --git a/flang/include/flang/Runtime/memory.h b/flang/include/flang/Runtime/memory.h
index bde056f439a5..e24c509f4e90 100644
--- a/flang/include/flang/Runtime/memory.h
+++ b/flang/include/flang/Runtime/memory.h
@@ -23,14 +23,17 @@ class Terminator;
[[nodiscard]] RT_API_ATTRS void *AllocateMemoryOrCrash(
const Terminator &, std::size_t bytes);
-template <typename A> [[nodiscard]] A &AllocateOrCrash(const Terminator &t) {
+template <typename A>
+[[nodiscard]] RT_API_ATTRS A &AllocateOrCrash(const Terminator &t) {
return *reinterpret_cast<A *>(AllocateMemoryOrCrash(t, sizeof(A)));
}
+RT_API_ATTRS void *ReallocateMemoryOrCrash(
+ const Terminator &, void *ptr, std::size_t newByteSize);
RT_API_ATTRS void FreeMemory(void *);
template <typename A> RT_API_ATTRS void FreeMemory(A *p) {
FreeMemory(reinterpret_cast<void *>(p));
}
-template <typename A> void FreeMemoryAndNullify(A *&p) {
+template <typename A> RT_API_ATTRS void FreeMemoryAndNullify(A *&p) {
FreeMemory(p);
p = nullptr;
}
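
A short sketch of the allocation helpers above (the Node type and the wrapper are
illustrative; a real Terminator comes from flang/Runtime/terminator.h):

    #include "flang/Runtime/memory.h"
    using namespace Fortran::runtime;

    struct Node {
      int value;
      Node *next;
    };

    // Allocates raw storage for a Node (or terminates the image on failure),
    // then frees it and nulls the pointer so it cannot dangle.
    void AllocateAndRelease(const Terminator &terminator) {
      Node *p{&AllocateOrCrash<Node>(terminator)};
      p->value = 0;
      p->next = nullptr;
      FreeMemoryAndNullify(p); // p is nullptr afterwards
    }
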
diff --git a/flang/include/flang/Runtime/misc-intrinsic.h b/flang/include/flang/Runtime/misc-intrinsic.h
index d4c20539532d..73cc9e2023d9 100644
--- a/flang/include/flang/Runtime/misc-intrinsic.h
+++ b/flang/include/flang/Runtime/misc-intrinsic.h
@@ -19,9 +19,9 @@ namespace Fortran::runtime {
class Descriptor;
extern "C" {
-void RTNAME(Transfer)(Descriptor &result, const Descriptor &source,
+void RTDECL(Transfer)(Descriptor &result, const Descriptor &source,
const Descriptor &mold, const char *sourceFile, int line);
-void RTNAME(TransferSize)(Descriptor &result, const Descriptor &source,
+void RTDECL(TransferSize)(Descriptor &result, const Descriptor &source,
const Descriptor &mold, const char *sourceFile, int line,
std::int64_t size);
} // extern "C"
diff --git a/flang/include/flang/Runtime/numeric.h b/flang/include/flang/Runtime/numeric.h
index e4e11a61731a..3d9cb8b5b0ac 100644
--- a/flang/include/flang/Runtime/numeric.h
+++ b/flang/include/flang/Runtime/numeric.h
@@ -20,280 +20,280 @@ namespace Fortran::runtime {
extern "C" {
// CEILING
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling4_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Ceiling4_1)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling4_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Ceiling4_2)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Ceiling4_4)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Ceiling4_8)(
CppTypeFor<TypeCategory::Real, 4>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling4_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Ceiling4_16)(
CppTypeFor<TypeCategory::Real, 4>);
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling8_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Ceiling8_1)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling8_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Ceiling8_2)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Ceiling8_4)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Ceiling8_8)(
CppTypeFor<TypeCategory::Real, 8>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling8_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Ceiling8_16)(
CppTypeFor<TypeCategory::Real, 8>);
#endif
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling10_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Ceiling10_1)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling10_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Ceiling10_2)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Ceiling10_4)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Ceiling10_8)(
CppTypeFor<TypeCategory::Real, 10>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling10_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Ceiling10_16)(
CppTypeFor<TypeCategory::Real, 10>);
#endif
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling16_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Ceiling16_1)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling16_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Ceiling16_2)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Ceiling16_4)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Ceiling16_8)(
CppTypeFor<TypeCategory::Real, 16>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling16_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Ceiling16_16)(
CppTypeFor<TypeCategory::Real, 16>);
#endif
#endif
// EXPONENT is defined to return default INTEGER; support INTEGER(4 & 8)
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Exponent4_4)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Exponent4_8)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Exponent8_4)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Exponent8_8)(
CppTypeFor<TypeCategory::Real, 8>);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Exponent10_4)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Exponent10_8)(
CppTypeFor<TypeCategory::Real, 10>);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Exponent16_4)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Exponent16_8)(
CppTypeFor<TypeCategory::Real, 16>);
#endif
// FLOOR
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor4_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Floor4_1)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor4_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Floor4_2)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Floor4_4)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Floor4_8)(
CppTypeFor<TypeCategory::Real, 4>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor4_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Floor4_16)(
CppTypeFor<TypeCategory::Real, 4>);
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor8_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Floor8_1)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor8_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Floor8_2)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Floor8_4)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Floor8_8)(
CppTypeFor<TypeCategory::Real, 8>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor8_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Floor8_16)(
CppTypeFor<TypeCategory::Real, 8>);
#endif
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor10_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Floor10_1)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor10_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Floor10_2)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Floor10_4)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Floor10_8)(
CppTypeFor<TypeCategory::Real, 10>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor10_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Floor10_16)(
CppTypeFor<TypeCategory::Real, 10>);
#endif
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor16_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Floor16_1)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor16_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Floor16_2)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Floor16_4)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Floor16_8)(
CppTypeFor<TypeCategory::Real, 16>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor16_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Floor16_16)(
CppTypeFor<TypeCategory::Real, 16>);
#endif
#endif
// FRACTION
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Fraction4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(Fraction4)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Fraction8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(Fraction8)(
CppTypeFor<TypeCategory::Real, 8>);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Fraction10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(Fraction10)(
CppTypeFor<TypeCategory::Real, 10>);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Fraction16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(Fraction16)(
CppTypeFor<TypeCategory::Real, 16>);
#endif
// ISNAN / IEEE_IS_NAN
-bool RTNAME(IsNaN4)(CppTypeFor<TypeCategory::Real, 4>);
-bool RTNAME(IsNaN8)(CppTypeFor<TypeCategory::Real, 8>);
+bool RTDECL(IsNaN4)(CppTypeFor<TypeCategory::Real, 4>);
+bool RTDECL(IsNaN8)(CppTypeFor<TypeCategory::Real, 8>);
#if LDBL_MANT_DIG == 64
-bool RTNAME(IsNaN10)(CppTypeFor<TypeCategory::Real, 10>);
+bool RTDECL(IsNaN10)(CppTypeFor<TypeCategory::Real, 10>);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-bool RTNAME(IsNaN16)(CppTypeFor<TypeCategory::Real, 16>);
+bool RTDECL(IsNaN16)(CppTypeFor<TypeCategory::Real, 16>);
#endif
// MOD & MODULO
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(ModInteger1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(ModInteger1)(
CppTypeFor<TypeCategory::Integer, 1>, CppTypeFor<TypeCategory::Integer, 1>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(ModInteger2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(ModInteger2)(
CppTypeFor<TypeCategory::Integer, 2>, CppTypeFor<TypeCategory::Integer, 2>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(ModInteger4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(ModInteger4)(
CppTypeFor<TypeCategory::Integer, 4>, CppTypeFor<TypeCategory::Integer, 4>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(ModInteger8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(ModInteger8)(
CppTypeFor<TypeCategory::Integer, 8>, CppTypeFor<TypeCategory::Integer, 8>,
const char *sourceFile = nullptr, int sourceLine = 0);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(ModInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(ModInteger16)(
CppTypeFor<TypeCategory::Integer, 16>,
CppTypeFor<TypeCategory::Integer, 16>, const char *sourceFile = nullptr,
int sourceLine = 0);
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(ModReal4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(ModReal4)(
CppTypeFor<TypeCategory::Real, 4>, CppTypeFor<TypeCategory::Real, 4>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(ModReal8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(ModReal8)(
CppTypeFor<TypeCategory::Real, 8>, CppTypeFor<TypeCategory::Real, 8>,
const char *sourceFile = nullptr, int sourceLine = 0);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(ModReal10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(ModReal10)(
CppTypeFor<TypeCategory::Real, 10>, CppTypeFor<TypeCategory::Real, 10>,
const char *sourceFile = nullptr, int sourceLine = 0);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(ModReal16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(ModReal16)(
CppTypeFor<TypeCategory::Real, 16>, CppTypeFor<TypeCategory::Real, 16>,
const char *sourceFile = nullptr, int sourceLine = 0);
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(ModuloInteger1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(ModuloInteger1)(
CppTypeFor<TypeCategory::Integer, 1>, CppTypeFor<TypeCategory::Integer, 1>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(ModuloInteger2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(ModuloInteger2)(
CppTypeFor<TypeCategory::Integer, 2>, CppTypeFor<TypeCategory::Integer, 2>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(ModuloInteger4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(ModuloInteger4)(
CppTypeFor<TypeCategory::Integer, 4>, CppTypeFor<TypeCategory::Integer, 4>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(ModuloInteger8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(ModuloInteger8)(
CppTypeFor<TypeCategory::Integer, 8>, CppTypeFor<TypeCategory::Integer, 8>,
const char *sourceFile = nullptr, int sourceLine = 0);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(ModuloInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(ModuloInteger16)(
CppTypeFor<TypeCategory::Integer, 16>,
CppTypeFor<TypeCategory::Integer, 16>, const char *sourceFile = nullptr,
int sourceLine = 0);
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(ModuloReal4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(ModuloReal4)(
CppTypeFor<TypeCategory::Real, 4>, CppTypeFor<TypeCategory::Real, 4>,
const char *sourceFile = nullptr, int sourceLine = 0);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(ModuloReal8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(ModuloReal8)(
CppTypeFor<TypeCategory::Real, 8>, CppTypeFor<TypeCategory::Real, 8>,
const char *sourceFile = nullptr, int sourceLine = 0);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(ModuloReal10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(ModuloReal10)(
CppTypeFor<TypeCategory::Real, 10>, CppTypeFor<TypeCategory::Real, 10>,
const char *sourceFile = nullptr, int sourceLine = 0);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(ModuloReal16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(ModuloReal16)(
CppTypeFor<TypeCategory::Real, 16>, CppTypeFor<TypeCategory::Real, 16>,
const char *sourceFile = nullptr, int sourceLine = 0);
#endif
// NINT
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint4_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Nint4_1)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint4_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Nint4_2)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Nint4_4)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Nint4_8)(
CppTypeFor<TypeCategory::Real, 4>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint4_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Nint4_16)(
CppTypeFor<TypeCategory::Real, 4>);
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint8_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Nint8_1)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint8_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Nint8_2)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Nint8_4)(
CppTypeFor<TypeCategory::Real, 8>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Nint8_8)(
CppTypeFor<TypeCategory::Real, 8>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint8_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Nint8_16)(
CppTypeFor<TypeCategory::Real, 8>);
#endif
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint10_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Nint10_1)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint10_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Nint10_2)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Nint10_4)(
CppTypeFor<TypeCategory::Real, 10>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Nint10_8)(
CppTypeFor<TypeCategory::Real, 10>);
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint10_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Nint10_16)(
CppTypeFor<TypeCategory::Real, 10>);
#endif
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint16_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDECL(Nint16_1)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint16_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDECL(Nint16_2)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(Nint16_4)(
CppTypeFor<TypeCategory::Real, 16>);
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDECL(Nint16_8)(
CppTypeFor<TypeCategory::Real, 16>);
#if defined __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint16_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDECL(Nint16_16)(
CppTypeFor<TypeCategory::Real, 16>);
#endif
#endif
@@ -301,113 +301,113 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint16_16)(
// NEAREST
// The second argument to NEAREST is the result of a comparison
// to zero (i.e., S > 0)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Nearest4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(Nearest4)(
CppTypeFor<TypeCategory::Real, 4>, bool positive);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Nearest8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(Nearest8)(
CppTypeFor<TypeCategory::Real, 8>, bool positive);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Nearest10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(Nearest10)(
CppTypeFor<TypeCategory::Real, 10>, bool positive);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Nearest16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(Nearest16)(
CppTypeFor<TypeCategory::Real, 16>, bool positive);
#endif
// RRSPACING
-CppTypeFor<TypeCategory::Real, 4> RTNAME(RRSpacing4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(RRSpacing4)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(RRSpacing8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(RRSpacing8)(
CppTypeFor<TypeCategory::Real, 8>);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(RRSpacing10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(RRSpacing10)(
CppTypeFor<TypeCategory::Real, 10>);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(RRSpacing16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(RRSpacing16)(
CppTypeFor<TypeCategory::Real, 16>);
#endif
// SET_EXPONENT's I= argument can be any INTEGER kind; upcast it to 64-bit
-CppTypeFor<TypeCategory::Real, 4> RTNAME(SetExponent4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(SetExponent4)(
CppTypeFor<TypeCategory::Real, 4>, std::int64_t);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(SetExponent8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(SetExponent8)(
CppTypeFor<TypeCategory::Real, 8>, std::int64_t);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(SetExponent10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(SetExponent10)(
CppTypeFor<TypeCategory::Real, 10>, std::int64_t);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(SetExponent16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(SetExponent16)(
CppTypeFor<TypeCategory::Real, 16>, std::int64_t);
#endif
// SCALE
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Scale4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(Scale4)(
CppTypeFor<TypeCategory::Real, 4>, std::int64_t);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Scale8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(Scale8)(
CppTypeFor<TypeCategory::Real, 8>, std::int64_t);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Scale10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(Scale10)(
CppTypeFor<TypeCategory::Real, 10>, std::int64_t);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Scale16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(Scale16)(
CppTypeFor<TypeCategory::Real, 16>, std::int64_t);
#endif
// SELECTED_INT_KIND
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(SelectedIntKind)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(SelectedIntKind)(
const char *, int, void *, int);
// SELECTED_REAL_KIND
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(SelectedRealKind)(
+CppTypeFor<TypeCategory::Integer, 4> RTDECL(SelectedRealKind)(
const char *, int, void *, int, void *, int, void *, int);
// SPACING
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Spacing4)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(Spacing4)(
CppTypeFor<TypeCategory::Real, 4>);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Spacing8)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(Spacing8)(
CppTypeFor<TypeCategory::Real, 8>);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Spacing10)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(Spacing10)(
CppTypeFor<TypeCategory::Real, 10>);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Spacing16)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(Spacing16)(
CppTypeFor<TypeCategory::Real, 16>);
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(FPow4i)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(FPow4i)(
CppTypeFor<TypeCategory::Real, 4> b,
CppTypeFor<TypeCategory::Integer, 4> e);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(FPow8i)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(FPow8i)(
CppTypeFor<TypeCategory::Real, 8> b,
CppTypeFor<TypeCategory::Integer, 4> e);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(FPow10i)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(FPow10i)(
CppTypeFor<TypeCategory::Real, 10> b,
CppTypeFor<TypeCategory::Integer, 4> e);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(FPow16i)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(FPow16i)(
CppTypeFor<TypeCategory::Real, 16> b,
CppTypeFor<TypeCategory::Integer, 4> e);
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(FPow4k)(
+CppTypeFor<TypeCategory::Real, 4> RTDECL(FPow4k)(
CppTypeFor<TypeCategory::Real, 4> b,
CppTypeFor<TypeCategory::Integer, 8> e);
-CppTypeFor<TypeCategory::Real, 8> RTNAME(FPow8k)(
+CppTypeFor<TypeCategory::Real, 8> RTDECL(FPow8k)(
CppTypeFor<TypeCategory::Real, 8> b,
CppTypeFor<TypeCategory::Integer, 8> e);
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(FPow10k)(
+CppTypeFor<TypeCategory::Real, 10> RTDECL(FPow10k)(
CppTypeFor<TypeCategory::Real, 10> b,
CppTypeFor<TypeCategory::Integer, 8> e);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(FPow16k)(
+CppTypeFor<TypeCategory::Real, 16> RTDECL(FPow16k)(
CppTypeFor<TypeCategory::Real, 16> b,
CppTypeFor<TypeCategory::Integer, 8> e);
#endif
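
A hedged sketch tying together the NEAREST note above and the
<realKind>_<intKind> suffix convention used throughout this header (RoundAndStep
is an illustrative helper, not part of the patch):

    #include "flang/Runtime/numeric.h"
    #include <cstdint>
    using namespace Fortran::runtime;

    // NEAREST(x, s): the runtime receives the result of the comparison s > 0,
    // not s itself.  Nint8_4 rounds a REAL(8) value to an INTEGER(4) result.
    std::int32_t RoundAndStep(double x, double s) {
      double next{RTNAME(Nearest8)(x, s > 0.0)};
      return RTNAME(Nint8_4)(next);
    }
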
diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h
index 52ab9482ed95..6ceb70ebb676 100644
--- a/flang/include/flang/Runtime/pointer.h
+++ b/flang/include/flang/Runtime/pointer.h
@@ -21,45 +21,45 @@ extern "C" {
// Data pointer initialization for NULLIFY(), "p=>NULL()", & for ALLOCATE().
// Initializes a pointer to a disassociated state for NULLIFY() or "p=>NULL()".
-void RTNAME(PointerNullifyIntrinsic)(
+void RTDECL(PointerNullifyIntrinsic)(
Descriptor &, TypeCategory, int kind, int rank = 0, int corank = 0);
-void RTNAME(PointerNullifyCharacter)(Descriptor &, SubscriptValue length = 0,
+void RTDECL(PointerNullifyCharacter)(Descriptor &, SubscriptValue length = 0,
int kind = 1, int rank = 0, int corank = 0);
-void RTNAME(PointerNullifyDerived)(
+void RTDECL(PointerNullifyDerived)(
Descriptor &, const typeInfo::DerivedType &, int rank = 0, int corank = 0);
// Explicitly sets the bounds of an initialized disassociated pointer.
// The upper cobound is ignored for the last codimension.
-void RTNAME(PointerSetBounds)(
+void RTDECL(PointerSetBounds)(
Descriptor &, int zeroBasedDim, SubscriptValue lower, SubscriptValue upper);
-void RTNAME(PointerSetCoBounds)(Descriptor &, int zeroBasedCoDim,
+void RTDECL(PointerSetCoBounds)(Descriptor &, int zeroBasedCoDim,
SubscriptValue lower, SubscriptValue upper = 0);
// Length type parameters are indexed in declaration order; i.e., 0 is the
// first length type parameter in the deepest base type. (Not for use
// with CHARACTER; see above.)
-void RTNAME(PointerSetDerivedLength)(Descriptor &, int which, SubscriptValue);
+void RTDECL(PointerSetDerivedLength)(Descriptor &, int which, SubscriptValue);
// For MOLD= allocation: acquires information from another descriptor
// to initialize a null data pointer.
-void RTNAME(PointerApplyMold)(
+void RTDECL(PointerApplyMold)(
Descriptor &, const Descriptor &mold, int rank = 0);
// Data pointer association for "p=>TARGET"
// Associates a scalar pointer with a simple scalar target.
-void RTNAME(PointerAssociateScalar)(Descriptor &, void *);
+void RTDECL(PointerAssociateScalar)(Descriptor &, void *);
// Associates a pointer with a target of the same rank, possibly with new lower
// bounds, which are passed in a vector whose length must equal the rank.
-void RTNAME(PointerAssociate)(Descriptor &, const Descriptor &target);
-void RTNAME(PointerAssociateLowerBounds)(
+void RTDECL(PointerAssociate)(Descriptor &, const Descriptor &target);
+void RTDECL(PointerAssociateLowerBounds)(
Descriptor &, const Descriptor &target, const Descriptor &lowerBounds);
// Associates a pointer with a target with bounds remapping. The target must be
// simply contiguous &/or of rank 1. The bounds constitute a [2,newRank]
// integer array whose columns are [lower bound, upper bound] on each dimension.
-void RTNAME(PointerAssociateRemapping)(Descriptor &, const Descriptor &target,
+void RTDECL(PointerAssociateRemapping)(Descriptor &, const Descriptor &target,
const Descriptor &bounds, const char *sourceFile = nullptr,
int sourceLine = 0);
@@ -70,7 +70,7 @@ void RTNAME(PointerAssociateRemapping)(Descriptor &, const Descriptor &target,
// a derived type or CHARACTER value, the explicit value has to match
// the length type parameter's value. This API checks that requirement.
// Returns 0 for success, or the STAT= value on failure with hasStat==true.
-int RTNAME(PointerCheckLengthParameter)(Descriptor &,
+int RTDECL(PointerCheckLengthParameter)(Descriptor &,
int which /* 0 for CHARACTER length */, SubscriptValue other,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
@@ -83,10 +83,10 @@ int RTNAME(PointerCheckLengthParameter)(Descriptor &,
// Successfully allocated memory is initialized if the pointer has a
// derived type, and is always initialized by PointerAllocateSource().
// Performs all necessary coarray synchronization and validation actions.
-int RTNAME(PointerAllocate)(Descriptor &, bool hasStat = false,
+int RTDECL(PointerAllocate)(Descriptor &, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
-int RTNAME(PointerAllocateSource)(Descriptor &, const Descriptor &source,
+int RTDECL(PointerAllocateSource)(Descriptor &, const Descriptor &source,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
@@ -95,13 +95,13 @@ int RTNAME(PointerAllocateSource)(Descriptor &, const Descriptor &source,
// Finalizes elements &/or components as needed. The pointer is left
// in an initialized disassociated state suitable for reallocation
// with the same bounds, cobounds, and length type parameters.
-int RTNAME(PointerDeallocate)(Descriptor &, bool hasStat = false,
+int RTDECL(PointerDeallocate)(Descriptor &, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
// Same as PointerDeallocate, but also sets the dynamic type to the declared
// type, as mentioned in 7.3.2.3 note 7.
-int RTNAME(PointerDeallocatePolymorphic)(Descriptor &,
+int RTDECL(PointerDeallocatePolymorphic)(Descriptor &,
const typeInfo::DerivedType *, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
@@ -109,10 +109,10 @@ int RTNAME(PointerDeallocatePolymorphic)(Descriptor &,
// Association inquiries for ASSOCIATED()
// True when the pointer is not disassociated.
-bool RTNAME(PointerIsAssociated)(const Descriptor &);
+bool RTDECL(PointerIsAssociated)(const Descriptor &);
// True when the pointer is associated with a specific target.
-bool RTNAME(PointerIsAssociatedWith)(
+bool RTDECL(PointerIsAssociatedWith)(
const Descriptor &, const Descriptor *target);
} // extern "C"
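
A sketch of the NULLIFY/ALLOCATE/DEALLOCATE sequence for an intrinsic-typed
pointer using the entry points above (the REAL(8) choice and the helper name are
illustrative; a zero return is taken as success, following the runtime's usual
STAT= convention):

    #include "flang/Runtime/pointer.h"
    using namespace Fortran::runtime;

    // NULLIFY(p); ALLOCATE(p, STAT=stat); ...; DEALLOCATE(p, STAT=stat)
    int AllocateAndFree(Descriptor &p) {
      RTNAME(PointerNullifyIntrinsic)(p, TypeCategory::Real, /*kind=*/8);
      if (int stat{RTNAME(PointerAllocate)(p, /*hasStat=*/true)}) {
        return stat; // allocation failed; hasStat keeps the image alive
      }
      return RTNAME(PointerDeallocate)(p, /*hasStat=*/true);
    }
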
diff --git a/flang/include/flang/Runtime/ragged.h b/flang/include/flang/Runtime/ragged.h
index e4b5838212a8..f52a619c55b4 100644
--- a/flang/include/flang/Runtime/ragged.h
+++ b/flang/include/flang/Runtime/ragged.h
@@ -29,11 +29,6 @@ struct RaggedArrayHeader {
std::int64_t *extentPointer;
};
-RaggedArrayHeader *RaggedArrayAllocate(
- RaggedArrayHeader *, bool, std::int64_t, std::int64_t, std::int64_t *);
-
-void RaggedArrayDeallocate(RaggedArrayHeader *);
-
extern "C" {
// For more on ragged arrays see https://en.wikipedia.org/wiki/Jagged_array. The
@@ -53,12 +48,12 @@ extern "C" {
// non-negative rank indicates the length of the extentVector, which is a list
// of non-negative extents. elementSize is the size of a data element in the
// rectangular space defined by the extentVector.
-void *RTNAME(RaggedArrayAllocate)(void *header, bool isHeader,
+void *RTDECL(RaggedArrayAllocate)(void *header, bool isHeader,
std::int64_t rank, std::int64_t elementSize, std::int64_t *extentVector);
// Runtime helper for deallocation of ragged array buffers. The root header of
// the ragged array structure is passed to deallocate the entire ragged array.
-void RTNAME(RaggedArrayDeallocate)(void *raggedArrayHeader);
+void RTDECL(RaggedArrayDeallocate)(void *raggedArrayHeader);
} // extern "C"
} // namespace Fortran::runtime
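
A heavily hedged sketch of the remaining C entry points (the assumption that
isHeader=false requests a buffer of data elements rather than of nested headers is
mine and is not stated in this hunk):

    #include "flang/Runtime/ragged.h"
    #include <cstdint>
    using namespace Fortran::runtime;

    // Allocates a rank-1 buffer of n eight-byte elements rooted at 'root',
    // then deallocates the entire ragged structure from that root header.
    void RaggedRoundTrip(RaggedArrayHeader &root, std::int64_t n) {
      std::int64_t extent[1]{n};
      RTNAME(RaggedArrayAllocate)(&root, /*isHeader=*/false, /*rank=*/1,
          /*elementSize=*/8, extent);
      RTNAME(RaggedArrayDeallocate)(&root);
    }
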
diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h
index a8469cb9dac8..b91fec0cd26b 100644
--- a/flang/include/flang/Runtime/reduction.h
+++ b/flang/include/flang/Runtime/reduction.h
@@ -46,389 +46,389 @@ extern "C" {
// SUM()
-std::int8_t RTNAME(SumInteger1)(const Descriptor &, const char *source,
+std::int8_t RTDECL(SumInteger1)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int16_t RTNAME(SumInteger2)(const Descriptor &, const char *source,
+std::int16_t RTDECL(SumInteger2)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int32_t RTNAME(SumInteger4)(const Descriptor &, const char *source,
+std::int32_t RTDECL(SumInteger4)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int64_t RTNAME(SumInteger8)(const Descriptor &, const char *source,
+std::int64_t RTDECL(SumInteger8)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(SumInteger16)(const Descriptor &, const char *source,
+common::int128_t RTDECL(SumInteger16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
// REAL/COMPLEX(2 & 3) return 32-bit float results for the caller to downconvert
-float RTNAME(SumReal2)(const Descriptor &, const char *source, int line,
+float RTDECL(SumReal2)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(SumReal3)(const Descriptor &, const char *source, int line,
+float RTDECL(SumReal3)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(SumReal4)(const Descriptor &, const char *source, int line,
+float RTDECL(SumReal4)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-double RTNAME(SumReal8)(const Descriptor &, const char *source, int line,
+double RTDECL(SumReal8)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#if LDBL_MANT_DIG == 64
-long double RTNAME(SumReal10)(const Descriptor &, const char *source, int line,
+long double RTDECL(SumReal10)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppFloat128Type RTNAME(SumReal16)(const Descriptor &, const char *source,
+CppFloat128Type RTDECL(SumReal16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-void RTNAME(CppSumComplex2)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppSumComplex2)(std::complex<float> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppSumComplex3)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppSumComplex3)(std::complex<float> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppSumComplex4)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppSumComplex4)(std::complex<float> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppSumComplex8)(std::complex<double> &, const Descriptor &,
+void RTDECL(CppSumComplex8)(std::complex<double> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppSumComplex10)(std::complex<long double> &, const Descriptor &,
+void RTDECL(CppSumComplex10)(std::complex<long double> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppSumComplex16)(std::complex<long double> &, const Descriptor &,
+void RTDECL(CppSumComplex16)(std::complex<long double> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(SumDim)(Descriptor &result, const Descriptor &array, int dim,
+void RTDECL(SumDim)(Descriptor &result, const Descriptor &array, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
// PRODUCT()
-std::int8_t RTNAME(ProductInteger1)(const Descriptor &, const char *source,
+std::int8_t RTDECL(ProductInteger1)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int16_t RTNAME(ProductInteger2)(const Descriptor &, const char *source,
+std::int16_t RTDECL(ProductInteger2)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int32_t RTNAME(ProductInteger4)(const Descriptor &, const char *source,
+std::int32_t RTDECL(ProductInteger4)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int64_t RTNAME(ProductInteger8)(const Descriptor &, const char *source,
+std::int64_t RTDECL(ProductInteger8)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(ProductInteger16)(const Descriptor &,
+common::int128_t RTDECL(ProductInteger16)(const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
#endif
// REAL/COMPLEX(2 & 3) return 32-bit float results for the caller to downconvert
-float RTNAME(ProductReal2)(const Descriptor &, const char *source, int line,
+float RTDECL(ProductReal2)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(ProductReal3)(const Descriptor &, const char *source, int line,
+float RTDECL(ProductReal3)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(ProductReal4)(const Descriptor &, const char *source, int line,
+float RTDECL(ProductReal4)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-double RTNAME(ProductReal8)(const Descriptor &, const char *source, int line,
+double RTDECL(ProductReal8)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#if LDBL_MANT_DIG == 64
-long double RTNAME(ProductReal10)(const Descriptor &, const char *source,
+long double RTDECL(ProductReal10)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppFloat128Type RTNAME(ProductReal16)(const Descriptor &, const char *source,
+CppFloat128Type RTDECL(ProductReal16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-void RTNAME(CppProductComplex2)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppProductComplex2)(std::complex<float> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppProductComplex3)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppProductComplex3)(std::complex<float> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppProductComplex4)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppProductComplex4)(std::complex<float> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppProductComplex8)(std::complex<double> &, const Descriptor &,
+void RTDECL(CppProductComplex8)(std::complex<double> &, const Descriptor &,
const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppProductComplex10)(std::complex<long double> &,
+void RTDECL(CppProductComplex10)(std::complex<long double> &,
const Descriptor &, const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(CppProductComplex16)(std::complex<long double> &,
+void RTDECL(CppProductComplex16)(std::complex<long double> &,
const Descriptor &, const char *source, int line, int dim = 0,
const Descriptor *mask = nullptr);
-void RTNAME(ProductDim)(Descriptor &result, const Descriptor &array, int dim,
+void RTDECL(ProductDim)(Descriptor &result, const Descriptor &array, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
// IALL, IANY, IPARITY
-std::int8_t RTNAME(IAll1)(const Descriptor &, const char *source, int line,
+std::int8_t RTDECL(IAll1)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int16_t RTNAME(IAll2)(const Descriptor &, const char *source, int line,
+std::int16_t RTDECL(IAll2)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int32_t RTNAME(IAll4)(const Descriptor &, const char *source, int line,
+std::int32_t RTDECL(IAll4)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int64_t RTNAME(IAll8)(const Descriptor &, const char *source, int line,
+std::int64_t RTDECL(IAll8)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(IAll16)(const Descriptor &, const char *source,
+common::int128_t RTDECL(IAll16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-void RTNAME(IAllDim)(Descriptor &result, const Descriptor &array, int dim,
+void RTDECL(IAllDim)(Descriptor &result, const Descriptor &array, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
-std::int8_t RTNAME(IAny1)(const Descriptor &, const char *source, int line,
+std::int8_t RTDECL(IAny1)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int16_t RTNAME(IAny2)(const Descriptor &, const char *source, int line,
+std::int16_t RTDECL(IAny2)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int32_t RTNAME(IAny4)(const Descriptor &, const char *source, int line,
+std::int32_t RTDECL(IAny4)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int64_t RTNAME(IAny8)(const Descriptor &, const char *source, int line,
+std::int64_t RTDECL(IAny8)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(IAny16)(const Descriptor &, const char *source,
+common::int128_t RTDECL(IAny16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-void RTNAME(IAnyDim)(Descriptor &result, const Descriptor &array, int dim,
+void RTDECL(IAnyDim)(Descriptor &result, const Descriptor &array, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
-std::int8_t RTNAME(IParity1)(const Descriptor &, const char *source, int line,
+std::int8_t RTDECL(IParity1)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int16_t RTNAME(IParity2)(const Descriptor &, const char *source, int line,
+std::int16_t RTDECL(IParity2)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int32_t RTNAME(IParity4)(const Descriptor &, const char *source, int line,
+std::int32_t RTDECL(IParity4)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-std::int64_t RTNAME(IParity8)(const Descriptor &, const char *source, int line,
+std::int64_t RTDECL(IParity8)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(IParity16)(const Descriptor &, const char *source,
+common::int128_t RTDECL(IParity16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-void RTNAME(IParityDim)(Descriptor &result, const Descriptor &array, int dim,
+void RTDECL(IParityDim)(Descriptor &result, const Descriptor &array, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
// FINDLOC, MAXLOC, & MINLOC
// These return allocated arrays in the supplied descriptor.
// The default value for KIND= should be the default INTEGER in effect at
// compilation time.
-void RTNAME(Findloc)(Descriptor &, const Descriptor &x,
+void RTDECL(Findloc)(Descriptor &, const Descriptor &x,
const Descriptor &target, int kind, const char *source, int line,
const Descriptor *mask = nullptr, bool back = false);
-void RTNAME(FindlocDim)(Descriptor &, const Descriptor &x,
+void RTDECL(FindlocDim)(Descriptor &, const Descriptor &x,
const Descriptor &target, int kind, int dim, const char *source, int line,
const Descriptor *mask = nullptr, bool back = false);
-void RTNAME(MaxlocCharacter)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocCharacter)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocInteger1)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocInteger1)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocInteger2)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocInteger2)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocInteger4)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocInteger4)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocInteger8)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocInteger8)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocInteger16)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocInteger16)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocReal4)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocReal4)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocReal8)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocReal8)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocReal10)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocReal10)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocReal16)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MaxlocReal16)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MaxlocDim)(Descriptor &, const Descriptor &x, int kind, int dim,
+void RTDECL(MaxlocDim)(Descriptor &, const Descriptor &x, int kind, int dim,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocCharacter)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocCharacter)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocInteger1)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocInteger1)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocInteger2)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocInteger2)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocInteger4)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocInteger4)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocInteger8)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocInteger8)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocInteger16)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocInteger16)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocReal4)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocReal4)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocReal8)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocReal8)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocReal10)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocReal10)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocReal16)(Descriptor &, const Descriptor &, int kind,
+void RTDECL(MinlocReal16)(Descriptor &, const Descriptor &, int kind,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
-void RTNAME(MinlocDim)(Descriptor &, const Descriptor &x, int kind, int dim,
+void RTDECL(MinlocDim)(Descriptor &, const Descriptor &x, int kind, int dim,
const char *source, int line, const Descriptor *mask = nullptr,
bool back = false);
// MAXVAL and MINVAL
-std::int8_t RTNAME(MaxvalInteger1)(const Descriptor &, const char *source,
+std::int8_t RTDECL(MaxvalInteger1)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int16_t RTNAME(MaxvalInteger2)(const Descriptor &, const char *source,
+std::int16_t RTDECL(MaxvalInteger2)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int32_t RTNAME(MaxvalInteger4)(const Descriptor &, const char *source,
+std::int32_t RTDECL(MaxvalInteger4)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int64_t RTNAME(MaxvalInteger8)(const Descriptor &, const char *source,
+std::int64_t RTDECL(MaxvalInteger8)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(MaxvalInteger16)(const Descriptor &, const char *source,
+common::int128_t RTDECL(MaxvalInteger16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-float RTNAME(MaxvalReal2)(const Descriptor &, const char *source, int line,
+float RTDECL(MaxvalReal2)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(MaxvalReal3)(const Descriptor &, const char *source, int line,
+float RTDECL(MaxvalReal3)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(MaxvalReal4)(const Descriptor &, const char *source, int line,
+float RTDECL(MaxvalReal4)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-double RTNAME(MaxvalReal8)(const Descriptor &, const char *source, int line,
+double RTDECL(MaxvalReal8)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#if LDBL_MANT_DIG == 64
-long double RTNAME(MaxvalReal10)(const Descriptor &, const char *source,
+long double RTDECL(MaxvalReal10)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppFloat128Type RTNAME(MaxvalReal16)(const Descriptor &, const char *source,
+CppFloat128Type RTDECL(MaxvalReal16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-void RTNAME(MaxvalCharacter)(Descriptor &, const Descriptor &,
+void RTDECL(MaxvalCharacter)(Descriptor &, const Descriptor &,
const char *source, int line, const Descriptor *mask = nullptr);
-std::int8_t RTNAME(MinvalInteger1)(const Descriptor &, const char *source,
+std::int8_t RTDECL(MinvalInteger1)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int16_t RTNAME(MinvalInteger2)(const Descriptor &, const char *source,
+std::int16_t RTDECL(MinvalInteger2)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int32_t RTNAME(MinvalInteger4)(const Descriptor &, const char *source,
+std::int32_t RTDECL(MinvalInteger4)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
-std::int64_t RTNAME(MinvalInteger8)(const Descriptor &, const char *source,
+std::int64_t RTDECL(MinvalInteger8)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(MinvalInteger16)(const Descriptor &, const char *source,
+common::int128_t RTDECL(MinvalInteger16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-float RTNAME(MinvalReal2)(const Descriptor &, const char *source, int line,
+float RTDECL(MinvalReal2)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(MinvalReal3)(const Descriptor &, const char *source, int line,
+float RTDECL(MinvalReal3)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-float RTNAME(MinvalReal4)(const Descriptor &, const char *source, int line,
+float RTDECL(MinvalReal4)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
-double RTNAME(MinvalReal8)(const Descriptor &, const char *source, int line,
+double RTDECL(MinvalReal8)(const Descriptor &, const char *source, int line,
int dim = 0, const Descriptor *mask = nullptr);
#if LDBL_MANT_DIG == 64
-long double RTNAME(MinvalReal10)(const Descriptor &, const char *source,
+long double RTDECL(MinvalReal10)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppFloat128Type RTNAME(MinvalReal16)(const Descriptor &, const char *source,
+CppFloat128Type RTDECL(MinvalReal16)(const Descriptor &, const char *source,
int line, int dim = 0, const Descriptor *mask = nullptr);
#endif
-void RTNAME(MinvalCharacter)(Descriptor &, const Descriptor &,
+void RTDECL(MinvalCharacter)(Descriptor &, const Descriptor &,
const char *source, int line, const Descriptor *mask = nullptr);
-void RTNAME(MaxvalDim)(Descriptor &, const Descriptor &, int dim,
+void RTDECL(MaxvalDim)(Descriptor &, const Descriptor &, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
-void RTNAME(MinvalDim)(Descriptor &, const Descriptor &, int dim,
+void RTDECL(MinvalDim)(Descriptor &, const Descriptor &, int dim,
const char *source, int line, const Descriptor *mask = nullptr);
// NORM2
-float RTNAME(Norm2_2)(
+float RTDECL(Norm2_2)(
const Descriptor &, const char *source, int line, int dim = 0);
-float RTNAME(Norm2_3)(
+float RTDECL(Norm2_3)(
const Descriptor &, const char *source, int line, int dim = 0);
-float RTNAME(Norm2_4)(
+float RTDECL(Norm2_4)(
const Descriptor &, const char *source, int line, int dim = 0);
-double RTNAME(Norm2_8)(
+double RTDECL(Norm2_8)(
const Descriptor &, const char *source, int line, int dim = 0);
#if LDBL_MANT_DIG == 64
-long double RTNAME(Norm2_10)(
+long double RTDECL(Norm2_10)(
const Descriptor &, const char *source, int line, int dim = 0);
#elif LDBL_MANT_DIG == 113
-long double RTNAME(Norm2_16)(
+long double RTDECL(Norm2_16)(
const Descriptor &, const char *source, int line, int dim = 0);
#endif
-void RTNAME(Norm2Dim)(
+void RTDECL(Norm2Dim)(
Descriptor &, const Descriptor &, int dim, const char *source, int line);
// ALL, ANY, COUNT, & PARITY logical reductions
-bool RTNAME(All)(const Descriptor &, const char *source, int line, int dim = 0);
-void RTNAME(AllDim)(Descriptor &result, const Descriptor &, int dim,
+bool RTDECL(All)(const Descriptor &, const char *source, int line, int dim = 0);
+void RTDECL(AllDim)(Descriptor &result, const Descriptor &, int dim,
const char *source, int line);
-bool RTNAME(Any)(const Descriptor &, const char *source, int line, int dim = 0);
-void RTNAME(AnyDim)(Descriptor &result, const Descriptor &, int dim,
+bool RTDECL(Any)(const Descriptor &, const char *source, int line, int dim = 0);
+void RTDECL(AnyDim)(Descriptor &result, const Descriptor &, int dim,
const char *source, int line);
-std::int64_t RTNAME(Count)(
+std::int64_t RTDECL(Count)(
const Descriptor &, const char *source, int line, int dim = 0);
-void RTNAME(CountDim)(Descriptor &result, const Descriptor &, int dim, int kind,
+void RTDECL(CountDim)(Descriptor &result, const Descriptor &, int dim, int kind,
const char *source, int line);
-bool RTNAME(Parity)(
+bool RTDECL(Parity)(
const Descriptor &, const char *source, int line, int dim = 0);
-void RTNAME(ParityDim)(Descriptor &result, const Descriptor &, int dim,
+void RTDECL(ParityDim)(Descriptor &result, const Descriptor &, int dim,
const char *source, int line);
// DOT_PRODUCT
-std::int8_t RTNAME(DotProductInteger1)(const Descriptor &, const Descriptor &,
+std::int8_t RTDECL(DotProductInteger1)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
-std::int16_t RTNAME(DotProductInteger2)(const Descriptor &, const Descriptor &,
+std::int16_t RTDECL(DotProductInteger2)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
-std::int32_t RTNAME(DotProductInteger4)(const Descriptor &, const Descriptor &,
+std::int32_t RTDECL(DotProductInteger4)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
-std::int64_t RTNAME(DotProductInteger8)(const Descriptor &, const Descriptor &,
+std::int64_t RTDECL(DotProductInteger8)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
#ifdef __SIZEOF_INT128__
-common::int128_t RTNAME(DotProductInteger16)(const Descriptor &,
+common::int128_t RTDECL(DotProductInteger16)(const Descriptor &,
const Descriptor &, const char *source = nullptr, int line = 0);
#endif
-float RTNAME(DotProductReal2)(const Descriptor &, const Descriptor &,
+float RTDECL(DotProductReal2)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
-float RTNAME(DotProductReal3)(const Descriptor &, const Descriptor &,
+float RTDECL(DotProductReal3)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
-float RTNAME(DotProductReal4)(const Descriptor &, const Descriptor &,
+float RTDECL(DotProductReal4)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
-double RTNAME(DotProductReal8)(const Descriptor &, const Descriptor &,
+double RTDECL(DotProductReal8)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
#if LDBL_MANT_DIG == 64
-long double RTNAME(DotProductReal10)(const Descriptor &, const Descriptor &,
+long double RTDECL(DotProductReal10)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppFloat128Type RTNAME(DotProductReal16)(const Descriptor &, const Descriptor &,
+CppFloat128Type RTDECL(DotProductReal16)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
#endif
-void RTNAME(CppDotProductComplex2)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppDotProductComplex2)(std::complex<float> &, const Descriptor &,
const Descriptor &, const char *source = nullptr, int line = 0);
-void RTNAME(CppDotProductComplex3)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppDotProductComplex3)(std::complex<float> &, const Descriptor &,
const Descriptor &, const char *source = nullptr, int line = 0);
-void RTNAME(CppDotProductComplex4)(std::complex<float> &, const Descriptor &,
+void RTDECL(CppDotProductComplex4)(std::complex<float> &, const Descriptor &,
const Descriptor &, const char *source = nullptr, int line = 0);
-void RTNAME(CppDotProductComplex8)(std::complex<double> &, const Descriptor &,
+void RTDECL(CppDotProductComplex8)(std::complex<double> &, const Descriptor &,
const Descriptor &, const char *source = nullptr, int line = 0);
#if LDBL_MANT_DIG == 64
-void RTNAME(CppDotProductComplex10)(std::complex<long double> &,
+void RTDECL(CppDotProductComplex10)(std::complex<long double> &,
const Descriptor &, const Descriptor &, const char *source = nullptr,
int line = 0);
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTNAME(CppDotProductComplex16)(std::complex<CppFloat128Type> &,
+void RTDECL(CppDotProductComplex16)(std::complex<CppFloat128Type> &,
const Descriptor &, const Descriptor &, const char *source = nullptr,
int line = 0);
#endif
-bool RTNAME(DotProductLogical)(const Descriptor &, const Descriptor &,
+bool RTDECL(DotProductLogical)(const Descriptor &, const Descriptor &,
const char *source = nullptr, int line = 0);
} // extern "C"
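
The declarations above are the reduction entry points of the Fortran runtime; the trailing dim and mask parameters carry the optional DIM= and MASK= arguments of the corresponding intrinsics. For orientation only, a Fortran fragment (not part of this patch) exercising the intrinsics these symbols implement:

  program reductions_demo
    implicit none
    integer :: a(2,3)
    a = reshape([1,-2,3,-4,5,-6], [2,3])
    print *, maxval(a, dim=1)               ! partial reduction (MaxvalDim-style entry)
    print *, minval(a, mask=a > 0)          ! masked total reduction (MinvalInteger4)
    print *, count(a > 0), any(a > 4)       ! Count / Any
    print *, findloc(a, value=5)            ! Findloc
    print *, dot_product([1,2,3], [4,5,6])  ! DotProductInteger4
  end program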
diff --git a/flang/include/flang/Runtime/support.h b/flang/include/flang/Runtime/support.h
index 1262a04b5cd4..e7ae2154b2a7 100644
--- a/flang/include/flang/Runtime/support.h
+++ b/flang/include/flang/Runtime/support.h
@@ -21,7 +21,7 @@ class Descriptor;
extern "C" {
// Predicate: is the storage described by a Descriptor contiguous in memory?
-bool RTNAME(IsContiguous)(const Descriptor &);
+bool RTDECL(IsContiguous)(const Descriptor &);
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index d3e4dc6cd4a2..96d3869cd093 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -10,6 +10,7 @@
/// debugging the test tools. This file must be included into the tool.
#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
@@ -311,6 +312,20 @@ inline void createDefaultFIRCodeGenPassPipeline(
if (config.VScaleMin != 0)
pm.addPass(fir::createVScaleAttrPass({config.VScaleMin, config.VScaleMax}));
+ // Add function attributes
+ fir::FunctionAttrTypes functionAttrs;
+
+ if (config.FramePointerKind != llvm::FramePointerKind::None) {
+ if (config.FramePointerKind == llvm::FramePointerKind::NonLeaf)
+ functionAttrs.framePointerKind =
+ mlir::LLVM::framePointerKind::FramePointerKind::NonLeaf;
+ else
+ functionAttrs.framePointerKind =
+ mlir::LLVM::framePointerKind::FramePointerKind::All;
+
+ pm.addPass(fir::createFunctionAttrPass(functionAttrs));
+ }
+
fir::addFIRToLLVMPass(pm, config);
}
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index ddec70fa9824..b346b30b158a 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -35,6 +35,7 @@ struct MLIRToLLVMPassPipelineConfig {
LoopVersioning = opts.LoopVersioning;
DebugInfo = opts.getDebugInfo();
AliasAnalysis = opts.AliasAnalysis;
+ FramePointerKind = opts.getFramePointer();
}
llvm::OptimizationLevel OptLevel; ///< optimisation level
@@ -44,6 +45,8 @@ struct MLIRToLLVMPassPipelineConfig {
bool AliasAnalysis = false; ///< Add TBAA tags to generated LLVMIR
llvm::codegenoptions::DebugInfoKind DebugInfo =
llvm::codegenoptions::NoDebugInfo; ///< Debug info generation.
+ llvm::FramePointerKind FramePointerKind =
+ llvm::FramePointerKind::None; ///< Add frame pointer to functions.
unsigned VScaleMin = 0; ///< SVE vector range minimum.
unsigned VScaleMax = 0; ///< SVE vector range maximum.
};
diff --git a/flang/lib/Decimal/big-radix-floating-point.h b/flang/lib/Decimal/big-radix-floating-point.h
index 7d5d31b7788d..2143d1d9b3f7 100644
--- a/flang/lib/Decimal/big-radix-floating-point.h
+++ b/flang/lib/Decimal/big-radix-floating-point.h
@@ -369,6 +369,12 @@ private:
}
return result;
}
+ constexpr Raw HUGE() const {
+ Raw result{static_cast<Raw>(Real::maxExponent)};
+ result <<= Real::significandBits;
+ result |= SignBit();
+ return result - 1; // decrement exponent, set all significand bits
+ }
Digit digit_[maxDigits]; // in little-endian order: digit_[0] is LSD
int digits_{0}; // # of elements in digit_[] array; zero when zero
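
As a worked example of the new HUGE() raw-pattern construction: for IEEE binary32, maxExponent is 255 and there are 23 significand bits, so the starting pattern is 0xFF shifted left by 23, i.e. 0x7F800000 (the +Inf encoding). Subtracting 1 borrows out of the all-zero significand, decrementing the exponent field and setting every significand bit, which gives 0x7F7FFFFF, the largest finite value HUGE(1.0_4) of about 3.4028235E38. With the sign bit OR'd in first, the same decrement yields the encoding of -HUGE, which is what the overflow paths below need.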
diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp
index d5b66b9fb933..d38af0f9b800 100644
--- a/flang/lib/Decimal/decimal-to-binary.cpp
+++ b/flang/lib/Decimal/decimal-to-binary.cpp
@@ -237,6 +237,15 @@ private:
int exponent_{0};
};
+// The standard says that these overflow cases round to "representable"
+// numbers, and some popular compilers interpret that to mean +/-HUGE()
+// rather than +/-Inf.
+static inline constexpr bool RoundOverflowToHuge(
+ enum FortranRounding rounding, bool isNegative) {
+ return rounding == RoundToZero || (!isNegative && rounding == RoundDown) ||
+ (isNegative && rounding == RoundUp);
+}
+
template <int PREC>
ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary(
bool isNegative, FortranRounding rounding) const {
@@ -256,12 +265,18 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary(
if (guard != 0) {
flags |= Inexact;
}
- if (fraction == 0 && guard <= oneHalf) {
- if ((!isNegative && rounding == RoundUp) ||
- (isNegative && rounding == RoundDown)) {
- // round to minimum nonzero value
- } else {
- return {Binary{}, static_cast<enum ConversionResultFlags>(flags)};
+ if (fraction == 0) {
+ if (guard <= oneHalf) {
+ if ((!isNegative && rounding == RoundUp) ||
+ (isNegative && rounding == RoundDown)) {
+ // round to least nonzero value
+ expo = 0;
+ } else { // round to zero
+ if (guard != 0) {
+ flags |= Underflow;
+ }
+ return {Binary{}, static_cast<enum ConversionResultFlags>(flags)};
+ }
}
} else {
// The value is nonzero; normalize it.
@@ -301,14 +316,21 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary(
}
if (expo == 1 && fraction < topBit) {
expo = 0; // subnormal
- }
- if (expo >= Binary::maxExponent) {
- expo = Binary::maxExponent; // Inf
- flags |= Overflow;
- if constexpr (Binary::bits == 80) { // x87
- fraction = IntType{1} << 63;
- } else {
- fraction = 0;
+ flags |= Underflow;
+ } else if (expo == 0) {
+ flags |= Underflow;
+ } else if (expo >= Binary::maxExponent) {
+ if (RoundOverflowToHuge(rounding, isNegative)) {
+ expo = Binary::maxExponent - 1;
+ fraction = mask;
+ } else { // Inf
+ expo = Binary::maxExponent;
+ flags |= Overflow;
+ if constexpr (Binary::bits == 80) { // x87
+ fraction = IntType{1} << 63;
+ } else {
+ fraction = 0;
+ }
}
}
using Raw = typename Binary::RawType;
@@ -338,14 +360,22 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() {
// Sanity checks for ridiculous exponents
static constexpr int crazy{2 * Real::decimalRange + log10Radix};
if (exponent_ < -crazy) {
+ enum ConversionResultFlags flags {
+ static_cast<enum ConversionResultFlags>(Inexact | Underflow)
+ };
if ((!isNegative_ && rounding_ == RoundUp) ||
(isNegative_ && rounding_ == RoundDown)) {
- return {Real{Raw{1} | SignBit()}}; // return least nonzero value
+ // return least nonzero value
+ return {Real{Raw{1} | SignBit()}, flags};
} else { // underflow to +/-0.
- return {Real{SignBit()}, Inexact};
+ return {Real{SignBit()}, flags};
+ }
+ } else if (exponent_ > crazy) { // overflow to +/-HUGE() or +/-Inf
+ if (RoundOverflowToHuge(rounding_, isNegative_)) {
+ return {Real{HUGE()}};
+ } else {
+ return {Real{Infinity()}, Overflow};
}
- } else if (exponent_ > crazy) { // overflow to +/-Inf.
- return {Real{Infinity()}, Overflow};
}
// Apply any negative decimal exponent by multiplication
// by a power of two, adjusting the binary exponent to compensate.
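
From Fortran, the new behaviour is observable through the formatted-input rounding mode: when the rounding direction points back toward zero, a decimal value beyond the range of the type now converts to +/-HUGE() rather than +/-Inf, and too-small values additionally raise Underflow. An illustrative fragment (not part of this patch; the exact flags raised are the runtime's business):

  program round_overflow_demo
    implicit none
    real :: x
    character(16) :: buf = '9.9E+999'
    read(buf, '(E16.0)', round='zero') x   ! round toward zero: expect HUGE(x), not +Inf
    print *, x, huge(x)
    read(buf, '(E16.0)', round='up') x     ! rounding a positive overflow up: expect +Inf
    print *, x
  end program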
diff --git a/flang/lib/Evaluate/real.cpp b/flang/lib/Evaluate/real.cpp
index 4fecaa1a131f..cb8be98bb6f4 100644
--- a/flang/lib/Evaluate/real.cpp
+++ b/flang/lib/Evaluate/real.cpp
@@ -350,7 +350,7 @@ ValueWithRealFlags<Real<W, P>> Real<W, P>::NEAREST(bool upward) const {
isNegative = !isNegative;
} else {
auto sub1{fraction.SubtractSigned(one)};
- if (sub1.overflow) {
+ if (sub1.overflow && expo > 1) {
nearest = Fraction{0}.NOT();
--expo;
} else {
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 8c755da4a2d8..7834364bccc4 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -1724,9 +1724,13 @@ bool IsLenTypeParameter(const Symbol &symbol) {
}
bool IsExtensibleType(const DerivedTypeSpec *derived) {
- return derived && !IsIsoCType(derived) &&
- !derived->typeSymbol().attrs().test(Attr::BIND_C) &&
- !derived->typeSymbol().get<DerivedTypeDetails>().sequence();
+ return !IsSequenceOrBindCType(derived) && !IsIsoCType(derived);
+}
+
+bool IsSequenceOrBindCType(const DerivedTypeSpec *derived) {
+ return derived &&
+ (derived->typeSymbol().attrs().test(Attr::BIND_C) ||
+ derived->typeSymbol().get<DerivedTypeDetails>().sequence());
}
bool IsBuiltinDerivedType(const DerivedTypeSpec *derived, const char *name) {
@@ -1761,6 +1765,10 @@ bool IsLockType(const DerivedTypeSpec *derived) {
return IsBuiltinDerivedType(derived, "lock_type");
}
+bool IsNotifyType(const DerivedTypeSpec *derived) {
+ return IsBuiltinDerivedType(derived, "notify_type");
+}
+
bool IsTeamType(const DerivedTypeSpec *derived) {
return IsBuiltinDerivedType(derived, "team_type");
}
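
IsExtensibleType is now expressed in terms of the new IsSequenceOrBindCType predicate, which names the two properties that make a derived type non-extensible, and IsNotifyType joins the existing lock_type/team_type queries for the Fortran 2023 NOTIFY_TYPE. In source terms (illustrative only, not from this patch):

  module extensibility_demo
    use, intrinsic :: iso_c_binding, only: c_int
    type :: plain                ! extensible: may be used with EXTENDS
      integer :: i
    end type
    type, extends(plain) :: child
      integer :: j
    end type
    type, bind(c) :: c_like      ! BIND(C) type: not extensible
      integer(c_int) :: i
    end type
    type :: seq_like             ! SEQUENCE type: not extensible
      sequence
      integer :: i
    end type
  end module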
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 4ce6171801d2..b65b6e31bea8 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -245,6 +245,23 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
opts.AliasAnalysis = opts.OptimizationLevel > 0;
+ // -mframe-pointer=none/non-leaf/all option.
+ if (const llvm::opt::Arg *a =
+ args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) {
+ std::optional<llvm::FramePointerKind> val =
+ llvm::StringSwitch<std::optional<llvm::FramePointerKind>>(a->getValue())
+ .Case("none", llvm::FramePointerKind::None)
+ .Case("non-leaf", llvm::FramePointerKind::NonLeaf)
+ .Case("all", llvm::FramePointerKind::All)
+ .Default(std::nullopt);
+
+ if (!val.has_value()) {
+ diags.Report(clang::diag::err_drv_invalid_value)
+ << a->getAsString(args) << a->getValue();
+ } else
+ opts.setFramePointer(val.value());
+ }
+
for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ))
opts.LLVMPassPlugins.push_back(a->getValue());
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index e1d406e3cf31..2bceee09b4f0 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3092,6 +3092,10 @@ private:
//===--------------------------------------------------------------------===//
+ void genFIR(const Fortran::parser::NotifyWaitStmt &stmt) {
+ genNotifyWaitStatement(*this, stmt);
+ }
+
void genFIR(const Fortran::parser::EventPostStmt &stmt) {
genEventPostStatement(*this, stmt);
}
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index fd726c90c07b..57ac9d0652b3 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -887,7 +887,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
// Handle the procedure pointer actual arguments.
if (actual.isProcedurePointer()) {
// Procedure pointer actual to procedure pointer dummy.
- if (hlfir::isBoxProcAddressType(dummyType))
+ if (fir::isBoxProcAddressType(dummyType))
return PreparedDummyArgument{actual, /*cleanups=*/{}};
// Procedure pointer actual to procedure dummy.
if (hlfir::isFortranProcedureValue(dummyType)) {
@@ -898,7 +898,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
// NULL() actual to procedure pointer dummy
if (Fortran::evaluate::IsNullProcedurePointer(expr) &&
- hlfir::isBoxProcAddressType(dummyType)) {
+ fir::isBoxProcAddressType(dummyType)) {
auto boxTy{Fortran::lower::getUntypedBoxProcType(builder.getContext())};
auto tempBoxProc{builder.createTemporary(loc, boxTy)};
hlfir::Entity nullBoxProc(
@@ -909,7 +909,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
if (actual.isProcedure()) {
// Procedure actual to procedure pointer dummy.
- if (hlfir::isBoxProcAddressType(dummyType)) {
+ if (fir::isBoxProcAddressType(dummyType)) {
auto tempBoxProc{builder.createTemporary(loc, actual.getType())};
builder.create<fir::StoreOp>(loc, actual, tempBoxProc);
return PreparedDummyArgument{tempBoxProc, /*cleanups=*/{}};
@@ -1555,8 +1555,6 @@ genIntrinsicRefCore(Fortran::lower::PreparedActualArguments &loweredActuals,
}
hlfir::Entity actual = arg.value()->getActual(loc, builder);
- if (actual.isProcedurePointer())
- TODO(loc, "Procedure pointer as actual argument to intrinsics.");
switch (argRules.lowerAs) {
case fir::LowerIntrinsicArgAs::Value:
operands.emplace_back(
diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp
index a60ca92a8733..3933ebeb9b3c 100644
--- a/flang/lib/Lower/IO.cpp
+++ b/flang/lib/Lower/IO.cpp
@@ -1850,24 +1850,25 @@ static mlir::Value genIOUnit(Fortran::lower::AbstractConverter &converter,
mlir::Location loc,
const Fortran::parser::IoUnit *iounit,
mlir::Type ty, ConditionSpecInfo &csi,
- Fortran::lower::StatementContext &stmtCtx) {
+ Fortran::lower::StatementContext &stmtCtx,
+ int defaultUnitNumber) {
auto &builder = converter.getFirOpBuilder();
if (iounit)
if (auto *e = std::get_if<Fortran::parser::FileUnitNumber>(&iounit->u))
return genIOUnitNumber(converter, loc, Fortran::semantics::GetExpr(*e),
ty, csi, stmtCtx);
return builder.create<mlir::arith::ConstantOp>(
- loc, builder.getIntegerAttr(ty, Fortran::runtime::io::DefaultUnit));
+ loc, builder.getIntegerAttr(ty, defaultUnitNumber));
}
template <typename A>
-static mlir::Value getIOUnit(Fortran::lower::AbstractConverter &converter,
- mlir::Location loc, const A &stmt, mlir::Type ty,
- ConditionSpecInfo &csi,
- Fortran::lower::StatementContext &stmtCtx) {
+static mlir::Value
+getIOUnit(Fortran::lower::AbstractConverter &converter, mlir::Location loc,
+ const A &stmt, mlir::Type ty, ConditionSpecInfo &csi,
+ Fortran::lower::StatementContext &stmtCtx, int defaultUnitNumber) {
const Fortran::parser::IoUnit *iounit =
stmt.iounit ? &*stmt.iounit : getIOControl<Fortran::parser::IoUnit>(stmt);
- return genIOUnit(converter, loc, iounit, ty, csi, stmtCtx);
+ return genIOUnit(converter, loc, iounit, ty, csi, stmtCtx, defaultUnitNumber);
}
//===----------------------------------------------------------------------===//
// Generators for each IO statement type.
@@ -2091,7 +2092,7 @@ getBeginDataTransferFunc(mlir::Location loc, fir::FirOpBuilder &builder,
}
/// Generate the arguments of a begin data transfer statement call.
-template <bool hasIOCtrl, typename A>
+template <bool hasIOCtrl, int defaultUnitNumber, typename A>
void genBeginDataTransferCallArgs(
llvm::SmallVectorImpl<mlir::Value> &ioArgs,
Fortran::lower::AbstractConverter &converter, mlir::Location loc,
@@ -2149,14 +2150,14 @@ void genBeginDataTransferCallArgs(
TODO(loc, "asynchronous");
maybeGetFormatArgs();
ioArgs.push_back(getIOUnit(converter, loc, stmt,
- ioFuncTy.getInput(ioArgs.size()), csi,
- stmtCtx));
+ ioFuncTy.getInput(ioArgs.size()), csi, stmtCtx,
+ defaultUnitNumber));
}
} else { // PRINT - maybe explicit format; default unit
maybeGetFormatArgs();
ioArgs.push_back(builder.create<mlir::arith::ConstantOp>(
loc, builder.getIntegerAttr(ioFuncTy.getInput(ioArgs.size()),
- Fortran::runtime::io::DefaultUnit)));
+ defaultUnitNumber)));
}
// File name and line number are always the last two arguments.
ioArgs.push_back(
@@ -2193,7 +2194,9 @@ genDataTransferStmt(Fortran::lower::AbstractConverter &converter,
loc, builder, isFormatted, isList || isNml, isInternal,
isInternalWithDesc, isAsync);
llvm::SmallVector<mlir::Value> ioArgs;
- genBeginDataTransferCallArgs<hasIOCtrl>(
+ genBeginDataTransferCallArgs<
+ hasIOCtrl, isInput ? Fortran::runtime::io::DefaultInputUnit
+ : Fortran::runtime::io::DefaultOutputUnit>(
ioArgs, converter, loc, stmt, ioFunc.getFunctionType(), isFormatted,
isList || isNml, isInternal, isAsync, descRef, csi, stmtCtx);
mlir::Value cookie =
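
Data transfer lowering previously fell back to a single DefaultUnit in both directions; genBeginDataTransferCallArgs is now instantiated with DefaultInputUnit for READ and DefaultOutputUnit for PRINT/WRITE when no unit is given. In source terms (illustrative; the conventional unit numbers are 5 for input and 6 for output):

  program default_units_demo
    implicit none
    integer :: n
    read *, n       ! list-directed READ on the default input unit
    print *, n      ! list-directed PRINT on the default output unit
    write(*, *) n   ! unit * also resolves to the default output unit
  end program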
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index fae54eefb02f..ecf70818c4ac 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1480,7 +1480,7 @@ getDeviceType(Fortran::parser::AccDeviceTypeExpr::Device device) {
case Fortran::parser::AccDeviceTypeExpr::Device::Multicore:
return mlir::acc::DeviceType::Multicore;
}
- return mlir::acc::DeviceType::Default;
+ return mlir::acc::DeviceType::None;
}
static void gatherDeviceTypeAttrs(
@@ -1781,26 +1781,24 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
bool outerCombined = false) {
// Parallel operation operands
- mlir::Value async;
- mlir::Value numWorkers;
- mlir::Value vectorLength;
mlir::Value ifCond;
mlir::Value selfCond;
- mlir::Value waitDevnum;
llvm::SmallVector<mlir::Value> waitOperands, attachEntryOperands,
copyEntryOperands, copyoutEntryOperands, createEntryOperands,
- dataClauseOperands, numGangs;
+ dataClauseOperands, numGangs, numWorkers, vectorLength, async;
+ llvm::SmallVector<mlir::Attribute> numGangsDeviceTypes, numWorkersDeviceTypes,
+ vectorLengthDeviceTypes, asyncDeviceTypes, asyncOnlyDeviceTypes,
+ waitOperandsDeviceTypes, waitOnlyDeviceTypes;
+ llvm::SmallVector<int32_t> numGangsSegments, waitOperandsSegments;
llvm::SmallVector<mlir::Value> reductionOperands, privateOperands,
firstprivateOperands;
llvm::SmallVector<mlir::Attribute> privatizations, firstPrivatizations,
reductionRecipes;
- // Async, wait and self clause have optional values but can be present with
+ // Self clause has optional values but can be present with
// no value as well. When there is no value, the op has an attribute to
// represent the clause.
- bool addAsyncAttr = false;
- bool addWaitAttr = false;
bool addSelfAttr = false;
bool hasDefaultNone = false;
@@ -1808,6 +1806,11 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+ // device_type attribute is set to `none` until a device_type clause is
+ // encountered.
+ auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get(
+ builder.getContext(), mlir::acc::DeviceType::None);
+
// Lower clauses values mapped to operands.
// Keep track of each group of operands separately as clauses can appear
// more than once.
@@ -1815,27 +1818,52 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
mlir::Location clauseLocation = converter.genLocation(clause.source);
if (const auto *asyncClause =
std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
- genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx);
+ const auto &asyncClauseValue = asyncClause->v;
+ if (asyncClauseValue) { // async has a value.
+ async.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(*asyncClauseValue), stmtCtx)));
+ asyncDeviceTypes.push_back(crtDeviceTypeAttr);
+ } else {
+ asyncOnlyDeviceTypes.push_back(crtDeviceTypeAttr);
+ }
} else if (const auto *waitClause =
std::get_if<Fortran::parser::AccClause::Wait>(&clause.u)) {
- genWaitClause(converter, waitClause, waitOperands, waitDevnum,
- addWaitAttr, stmtCtx);
+ const auto &waitClauseValue = waitClause->v;
+ if (waitClauseValue) { // wait has a value.
+ const Fortran::parser::AccWaitArgument &waitArg = *waitClauseValue;
+ const auto &waitList =
+ std::get<std::list<Fortran::parser::ScalarIntExpr>>(waitArg.t);
+ auto crtWaitOperands = waitOperands.size();
+ for (const Fortran::parser::ScalarIntExpr &value : waitList) {
+ waitOperands.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(value), stmtCtx)));
+ }
+ waitOperandsDeviceTypes.push_back(crtDeviceTypeAttr);
+ waitOperandsSegments.push_back(waitOperands.size() - crtWaitOperands);
+ } else {
+ waitOnlyDeviceTypes.push_back(crtDeviceTypeAttr);
+ }
} else if (const auto *numGangsClause =
std::get_if<Fortran::parser::AccClause::NumGangs>(
&clause.u)) {
+ auto crtNumGangs = numGangs.size();
for (const Fortran::parser::ScalarIntExpr &expr : numGangsClause->v)
numGangs.push_back(fir::getBase(converter.genExprValue(
*Fortran::semantics::GetExpr(expr), stmtCtx)));
+ numGangsDeviceTypes.push_back(crtDeviceTypeAttr);
+ numGangsSegments.push_back(numGangs.size() - crtNumGangs);
} else if (const auto *numWorkersClause =
std::get_if<Fortran::parser::AccClause::NumWorkers>(
&clause.u)) {
- numWorkers = fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx));
+ numWorkers.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx)));
+ numWorkersDeviceTypes.push_back(crtDeviceTypeAttr);
} else if (const auto *vectorLengthClause =
std::get_if<Fortran::parser::AccClause::VectorLength>(
&clause.u)) {
- vectorLength = fir::getBase(converter.genExprValue(
- *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx));
+ vectorLength.push_back(fir::getBase(converter.genExprValue(
+ *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx)));
+ vectorLengthDeviceTypes.push_back(crtDeviceTypeAttr);
} else if (const auto *ifClause =
std::get_if<Fortran::parser::AccClause::If>(&clause.u)) {
genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx);
@@ -1986,18 +2014,27 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
else if ((defaultClause->v).v ==
llvm::acc::DefaultValue::ACC_Default_present)
hasDefaultPresent = true;
+ } else if (const auto *deviceTypeClause =
+ std::get_if<Fortran::parser::AccClause::DeviceType>(
+ &clause.u)) {
+ const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList =
+ deviceTypeClause->v;
+ assert(deviceTypeExprList.v.size() == 1 &&
+ "expect only one device_type expr");
+ crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get(
+ builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v));
}
}
// Prepare the operand segment size attribute and the operands value range.
llvm::SmallVector<mlir::Value, 8> operands;
llvm::SmallVector<int32_t, 8> operandSegments;
- addOperand(operands, operandSegments, async);
+ addOperands(operands, operandSegments, async);
addOperands(operands, operandSegments, waitOperands);
if constexpr (!std::is_same_v<Op, mlir::acc::SerialOp>) {
addOperands(operands, operandSegments, numGangs);
- addOperand(operands, operandSegments, numWorkers);
- addOperand(operands, operandSegments, vectorLength);
+ addOperands(operands, operandSegments, numWorkers);
+ addOperands(operands, operandSegments, vectorLength);
}
addOperand(operands, operandSegments, ifCond);
addOperand(operands, operandSegments, selfCond);
@@ -2018,10 +2055,6 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
builder, currentLocation, eval, operands, operandSegments,
outerCombined);
- if (addAsyncAttr)
- computeOp.setAsyncAttrAttr(builder.getUnitAttr());
- if (addWaitAttr)
- computeOp.setWaitAttrAttr(builder.getUnitAttr());
if (addSelfAttr)
computeOp.setSelfAttrAttr(builder.getUnitAttr());
@@ -2030,6 +2063,34 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
if (hasDefaultPresent)
computeOp.setDefaultAttr(mlir::acc::ClauseDefaultValue::Present);
+ if constexpr (!std::is_same_v<Op, mlir::acc::SerialOp>) {
+ if (!numWorkersDeviceTypes.empty())
+ computeOp.setNumWorkersDeviceTypeAttr(
+ mlir::ArrayAttr::get(builder.getContext(), numWorkersDeviceTypes));
+ if (!vectorLengthDeviceTypes.empty())
+ computeOp.setVectorLengthDeviceTypeAttr(
+ mlir::ArrayAttr::get(builder.getContext(), vectorLengthDeviceTypes));
+ if (!numGangsDeviceTypes.empty())
+ computeOp.setNumGangsDeviceTypeAttr(
+ mlir::ArrayAttr::get(builder.getContext(), numGangsDeviceTypes));
+ if (!numGangsSegments.empty())
+ computeOp.setNumGangsSegmentsAttr(
+ builder.getDenseI32ArrayAttr(numGangsSegments));
+ }
+ if (!asyncDeviceTypes.empty())
+ computeOp.setAsyncDeviceTypeAttr(builder.getArrayAttr(asyncDeviceTypes));
+ if (!asyncOnlyDeviceTypes.empty())
+ computeOp.setAsyncOnlyAttr(builder.getArrayAttr(asyncOnlyDeviceTypes));
+
+ if (!waitOperandsDeviceTypes.empty())
+ computeOp.setWaitOperandsDeviceTypeAttr(
+ builder.getArrayAttr(waitOperandsDeviceTypes));
+ if (!waitOperandsSegments.empty())
+ computeOp.setWaitOperandsSegmentsAttr(
+ builder.getDenseI32ArrayAttr(waitOperandsSegments));
+ if (!waitOnlyDeviceTypes.empty())
+ computeOp.setWaitOnlyAttr(builder.getArrayAttr(waitOnlyDeviceTypes));
+
if constexpr (!std::is_same_v<Op, mlir::acc::KernelsOp>) {
if (!privatizations.empty())
computeOp.setPrivatizationsAttr(
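
The compute-construct lowering now keeps async, wait, num_gangs, num_workers and vector_length as variadic operands segmented per device_type, with matching device-type attributes; clauses seen before any device_type clause are recorded against DeviceType::None, which acts as the default. An OpenACC fragment of the kind this models (illustrative, not from this patch):

  subroutine acc_device_type_demo(a)
    real :: a(:)
    integer :: i
    !$acc parallel num_gangs(64) &
    !$acc&   device_type(nvidia) num_workers(32) vector_length(128) &
    !$acc&   device_type(host) num_workers(1)
    !$acc loop
    do i = 1, size(a)
      a(i) = 2.0 * a(i)
    end do
    !$acc end parallel
  end subroutine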
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index 9213cff95d3f..c3a570bf15ea 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -30,6 +30,7 @@
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/CommandLine.h"
@@ -606,6 +607,12 @@ public:
llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
&useDeviceSymbols) const;
+ template <typename T>
+ bool
+ processMotionClauses(Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::StatementContext &stmtCtx,
+ llvm::SmallVectorImpl<mlir::Value> &mapOperands);
+
// Call this method for these clauses that should be supported but are not
// implemented yet. It triggers a compilation error if any of the given
// clauses is found.
@@ -1892,6 +1899,47 @@ bool ClauseProcessor::processUseDevicePtr(
});
}
+template <typename T>
+bool ClauseProcessor::processMotionClauses(
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::StatementContext &stmtCtx,
+ llvm::SmallVectorImpl<mlir::Value> &mapOperands) {
+ return findRepeatableClause<T>(
+ [&](const T *motionClause, const Fortran::parser::CharBlock &source) {
+ mlir::Location clauseLocation = converter.genLocation(source);
+ fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+
+ static_assert(std::is_same_v<T, ClauseProcessor::ClauseTy::To> ||
+ std::is_same_v<T, ClauseProcessor::ClauseTy::From>);
+
+ // TODO Support motion modifiers: present, mapper, iterator.
+ constexpr llvm::omp::OpenMPOffloadMappingFlags mapTypeBits =
+ std::is_same_v<T, ClauseProcessor::ClauseTy::To>
+ ? llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO
+ : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM;
+
+ for (const Fortran::parser::OmpObject &ompObject : motionClause->v.v) {
+ llvm::SmallVector<mlir::Value> bounds;
+ std::stringstream asFortran;
+ Fortran::lower::AddrAndBoundsInfo info =
+ Fortran::lower::gatherDataOperandAddrAndBounds<
+ Fortran::parser::OmpObject, mlir::omp::DataBoundsOp,
+ mlir::omp::DataBoundsType>(
+ converter, firOpBuilder, semanticsContext, stmtCtx, ompObject,
+ clauseLocation, asFortran, bounds, treatIndexAsSection);
+
+ mlir::Value mapOp = createMapInfoOp(
+ firOpBuilder, clauseLocation, info.addr, asFortran, bounds,
+ static_cast<
+ std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
+ mapTypeBits),
+ mlir::omp::VariableCaptureKind::ByRef, info.addr.getType());
+
+ mapOperands.push_back(mapOp);
+ }
+ });
+}
+
template <typename... Ts>
void ClauseProcessor::processTODO(mlir::Location currentLocation,
llvm::omp::Directive directive) const {
@@ -2122,14 +2170,13 @@ static void createBodyOfOp(
llvm::SmallVector<mlir::Type> tiv(args.size(), loopVarType);
llvm::SmallVector<mlir::Location> locs(args.size(), loc);
firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs);
- int argIndex = 0;
// The argument is not currently in memory, so make a temporary for the
// argument, and store it there, then bind that location to the argument.
- for (const Fortran::semantics::Symbol *arg : args) {
+ for (auto [argIndex, argSymbol] : llvm::enumerate(args)) {
mlir::Value indexVal =
fir::getBase(op.getRegion().front().getArgument(argIndex));
- storeOp = createAndSetPrivatizedLoopVar(converter, loc, indexVal, arg);
- argIndex++;
+ storeOp =
+ createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
}
} else {
firOpBuilder.createBlock(&op.getRegion());
@@ -2190,14 +2237,15 @@ static void genBodyOfTargetDataOp(
firOpBuilder.createBlock(&region, {}, useDeviceTypes, useDeviceLocs);
- unsigned argIndex = 0;
- for (const Fortran::semantics::Symbol *sym : useDeviceSymbols) {
+ for (auto [argIndex, argSymbol] : llvm::enumerate(useDeviceSymbols)) {
const mlir::BlockArgument &arg = region.front().getArgument(argIndex);
- fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym);
+ fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*argSymbol);
if (auto refType = arg.getType().dyn_cast<fir::ReferenceType>()) {
if (fir::isa_builtin_cptr_type(refType.getElementType())) {
- converter.bindSymbol(*sym, arg);
+ converter.bindSymbol(*argSymbol, arg);
} else {
+ // Avoid capture of a reference to a structured binding.
+ const Fortran::semantics::Symbol *sym = argSymbol;
extVal.match(
[&](const fir::MutableBoxValue &mbv) {
converter.bindSymbol(
@@ -2214,7 +2262,6 @@ static void genBodyOfTargetDataOp(
TODO(converter.getCurrentLocation(),
"use_device clause operand unsupported type");
}
- argIndex++;
}
// Insert dummy instruction to remember the insertion position. The
@@ -2416,10 +2463,10 @@ genDataOp(Fortran::lower::AbstractConverter &converter,
template <typename OpTy>
static OpTy
-genEnterExitDataOp(Fortran::lower::AbstractConverter &converter,
- Fortran::semantics::SemanticsContext &semanticsContext,
- mlir::Location currentLocation,
- const Fortran::parser::OmpClauseList &clauseList) {
+genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ mlir::Location currentLocation,
+ const Fortran::parser::OmpClauseList &clauseList) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
Fortran::lower::StatementContext stmtCtx;
mlir::Value ifClauseOperand, deviceOperand;
@@ -2436,6 +2483,10 @@ genEnterExitDataOp(Fortran::lower::AbstractConverter &converter,
directiveName =
Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetExitData;
directive = llvm::omp::Directive::OMPD_target_exit_data;
+ } else if constexpr (std::is_same_v<OpTy, mlir::omp::UpdateDataOp>) {
+ directiveName =
+ Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetUpdate;
+ directive = llvm::omp::Directive::OMPD_target_update;
} else {
return nullptr;
}
@@ -2444,8 +2495,18 @@ genEnterExitDataOp(Fortran::lower::AbstractConverter &converter,
cp.processIf(directiveName, ifClauseOperand);
cp.processDevice(stmtCtx, deviceOperand);
cp.processNowait(nowaitAttr);
- cp.processMap(currentLocation, directive, semanticsContext, stmtCtx,
- mapOperands);
+
+ if constexpr (std::is_same_v<OpTy, mlir::omp::UpdateDataOp>) {
+ cp.processMotionClauses<Fortran::parser::OmpClause::To>(
+ semanticsContext, stmtCtx, mapOperands);
+ cp.processMotionClauses<Fortran::parser::OmpClause::From>(
+ semanticsContext, stmtCtx, mapOperands);
+
+ } else {
+ cp.processMap(currentLocation, directive, semanticsContext, stmtCtx,
+ mapOperands);
+ }
+
cp.processTODO<Fortran::parser::OmpClause::Depend>(currentLocation,
directive);
@@ -2470,8 +2531,6 @@ static void genBodyOfTargetOp(
auto *regionBlock =
firOpBuilder.createBlock(&region, {}, mapSymTypes, mapSymLocs);
- unsigned argIndex = 0;
-
// Clones the `bounds` placing them inside the target region and returns them.
auto cloneBound = [&](mlir::Value bound) {
if (mlir::isMemoryEffectFree(bound.getDefiningOp())) {
@@ -2491,8 +2550,10 @@ static void genBodyOfTargetOp(
};
// Bind the symbols to their corresponding block arguments.
- for (const Fortran::semantics::Symbol *sym : mapSymbols) {
+ for (auto [argIndex, argSymbol] : llvm::enumerate(mapSymbols)) {
const mlir::BlockArgument &arg = region.getArgument(argIndex);
+ // Avoid capture of a reference to a structured binding.
+ const Fortran::semantics::Symbol *sym = argSymbol;
fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym);
extVal.match(
[&](const fir::BoxValue &v) {
@@ -2527,7 +2588,6 @@ static void genBodyOfTargetOp(
TODO(converter.getCurrentLocation(),
"target map clause operand unsupported type");
});
- argIndex++;
}
// Check if cloning the bounds introduced any dependency on the outer region.
@@ -2848,15 +2908,17 @@ genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter,
genDataOp(converter, eval, semanticsContext, currentLocation, opClauseList);
break;
case llvm::omp::Directive::OMPD_target_enter_data:
- genEnterExitDataOp<mlir::omp::EnterDataOp>(converter, semanticsContext,
- currentLocation, opClauseList);
+ genEnterExitUpdateDataOp<mlir::omp::EnterDataOp>(
+ converter, semanticsContext, currentLocation, opClauseList);
break;
case llvm::omp::Directive::OMPD_target_exit_data:
- genEnterExitDataOp<mlir::omp::ExitDataOp>(converter, semanticsContext,
- currentLocation, opClauseList);
+ genEnterExitUpdateDataOp<mlir::omp::ExitDataOp>(
+ converter, semanticsContext, currentLocation, opClauseList);
break;
case llvm::omp::Directive::OMPD_target_update:
- TODO(currentLocation, "OMPD_target_update");
+ genEnterExitUpdateDataOp<mlir::omp::UpdateDataOp>(
+ converter, semanticsContext, currentLocation, opClauseList);
+ break;
case llvm::omp::Directive::OMPD_ordered:
TODO(currentLocation, "OMPD_ordered");
}
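
With genEnterExitDataOp generalized to genEnterExitUpdateDataOp, the target update directive now lowers to an mlir::omp::UpdateDataOp whose map operands come from the TO and FROM motion clauses via processMotionClauses rather than from processMap. Illustrative source (not from this patch):

  subroutine target_update_demo(a)
    real :: a(1000)
    !$omp target enter data map(to: a)
    !$omp target
    a = a + 1.0
    !$omp end target
    !$omp target update from(a)   ! motion clause: copy the device data back to the host
    !$omp target exit data map(release: a)
  end subroutine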
diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp
index 8855cab8b517..e7695929623f 100644
--- a/flang/lib/Lower/Runtime.cpp
+++ b/flang/lib/Lower/Runtime.cpp
@@ -137,6 +137,12 @@ void Fortran::lower::genFailImageStatement(
genUnreachable(builder, loc);
}
+void Fortran::lower::genNotifyWaitStatement(
+ Fortran::lower::AbstractConverter &converter,
+ const Fortran::parser::NotifyWaitStmt &) {
+ TODO(converter.getCurrentLocation(), "coarray: NOTIFY WAIT runtime");
+}
+
void Fortran::lower::genEventPostStatement(
Fortran::lower::AbstractConverter &converter,
const Fortran::parser::EventPostStmt &) {
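
NOTIFY WAIT is the Fortran 2023 notified-access statement; like EVENT POST below it, its lowering currently stops at a coarray TODO. A sketch of the source form it will eventually have to accept (assuming the F2023 syntax; not part of this patch):

  program notify_wait_demo
    use, intrinsic :: iso_fortran_env, only: notify_type
    implicit none
    type(notify_type) :: ready[*]
    ! Some other image is assumed to perform a notified assignment that
    ! increments READY on this image.
    notify wait (ready, until_count=1)
    print *, 'notified on image', this_image()
  end program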
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 17efa45b8667..94f723b4bae7 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -935,7 +935,7 @@ hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
hlfir::convertToValue(mlir::Location loc, fir::FirOpBuilder &builder,
- const hlfir::Entity &entity) {
+ hlfir::Entity entity) {
// Load scalar references to integer, logical, real, or complex value
// to an mlir value, dereference allocatable and pointers, and get rid
// of fir.box that are not needed or create a copy into contiguous memory.
@@ -957,7 +957,12 @@ static fir::ExtendedValue placeTrivialInMemory(mlir::Location loc,
std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
hlfir::convertToBox(mlir::Location loc, fir::FirOpBuilder &builder,
- const hlfir::Entity &entity, mlir::Type targetType) {
+ hlfir::Entity entity, mlir::Type targetType) {
+ // fir::factory::createBoxValue is not meant to deal with procedures.
+ // Dereference procedure pointers here.
+ if (entity.isProcedurePointer())
+ entity = hlfir::derefPointersAndAllocatables(loc, builder, entity);
+
auto [exv, cleanup] = translateToExtendedValue(loc, builder, entity);
// Procedure entities should not go through createBoxValue that embox
// object entities. Return the fir.boxproc directly.
@@ -972,7 +977,7 @@ hlfir::convertToBox(mlir::Location loc, fir::FirOpBuilder &builder,
std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>>
hlfir::convertToAddress(mlir::Location loc, fir::FirOpBuilder &builder,
- const hlfir::Entity &entity, mlir::Type targetType) {
+ hlfir::Entity entity, mlir::Type targetType) {
hlfir::Entity derefedEntity =
hlfir::derefPointersAndAllocatables(loc, builder, entity);
auto [exv, cleanup] =
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ff5dbff04360..c8057fbdd475 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -165,6 +165,10 @@ static constexpr IntrinsicHandler handlers[]{
{"fptr", asInquired},
{"shape", asAddr, handleDynamicOptional}}},
/*isElemental=*/false},
+ {"c_f_procpointer",
+ &I::genCFProcPointer,
+ {{{"cptr", asValue}, {"fptr", asInquired}}},
+ /*isElemental=*/false},
{"c_funloc", &I::genCFunLoc, {{{"x", asBox}}}, /*isElemental=*/false},
{"c_loc", &I::genCLoc, {{{"x", asBox}}}, /*isElemental=*/false},
{"ceiling", &I::genCeiling},
@@ -2134,6 +2138,33 @@ fir::ExtendedValue
IntrinsicLibrary::genAssociated(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
+ if (fir::isBoxProcAddressType(fir::getBase(args[0]).getType())) {
+ mlir::Value pointerBoxProc =
+ builder.create<fir::LoadOp>(loc, fir::getBase(args[0]));
+ mlir::Value pointerTarget =
+ builder.create<fir::BoxAddrOp>(loc, pointerBoxProc);
+ if (isStaticallyAbsent(args[1]))
+ return builder.genIsNotNullAddr(loc, pointerTarget);
+ mlir::Value target = fir::getBase(args[1]);
+ if (fir::isBoxProcAddressType(target.getType()))
+ target = builder.create<fir::LoadOp>(loc, target);
+ if (target.getType().isa<fir::BoxProcType>())
+ target = builder.create<fir::BoxAddrOp>(loc, target);
+ mlir::Type intPtrTy = builder.getIntPtrType();
+ mlir::Value pointerInt =
+ builder.createConvert(loc, intPtrTy, pointerTarget);
+ mlir::Value targetInt = builder.createConvert(loc, intPtrTy, target);
+ mlir::Value sameTarget = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::eq, pointerInt, targetInt);
+ mlir::Value zero = builder.createIntegerConstant(loc, intPtrTy, 0);
+ mlir::Value notNull = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::ne, zero, pointerInt);
+ // The notNull test covers the following two cases:
+ // - TARGET is a procedure that is OPTIONAL and absent at runtime.
+ // - TARGET is a procedure pointer that is NULL.
+ // In both cases, ASSOCIATED should be false if POINTER is NULL.
+ return builder.create<mlir::arith::AndIOp>(loc, sameTarget, notNull);
+ }
auto *pointer =
args[0].match([&](const fir::MutableBoxValue &x) { return &x; },
[&](const auto &) -> const fir::MutableBoxValue * {
@@ -2498,6 +2529,22 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {
/*lbounds=*/mlir::ValueRange{});
}
+// C_F_PROCPOINTER
+void IntrinsicLibrary::genCFProcPointer(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+ mlir::Value cptr =
+ fir::factory::genCPtrOrCFunptrValue(builder, loc, fir::getBase(args[0]));
+ mlir::Value fptr = fir::getBase(args[1]);
+ auto boxProcType =
+ mlir::cast<fir::BoxProcType>(fir::unwrapRefType(fptr.getType()));
+ mlir::Value cptrCast =
+ builder.createConvert(loc, boxProcType.getEleTy(), cptr);
+ mlir::Value cptrBox =
+ builder.create<fir::EmboxProcOp>(loc, boxProcType, cptrCast);
+ builder.create<fir::StoreOp>(loc, cptrBox, fptr);
+}
+
// C_FUNLOC
fir::ExtendedValue
IntrinsicLibrary::genCFunLoc(mlir::Type resultType,
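
The new c_f_procpointer handler, together with the ASSOCIATED support for procedure pointers added to genAssociated above, covers the usual ISO_C_BINDING round trip for procedures. Illustrative program (not part of this patch):

  module cfp_demo_m
  contains
    subroutine hello() bind(c)
      print *, 'hello'
    end subroutine
  end module

  program cfp_demo
    use, intrinsic :: iso_c_binding
    use cfp_demo_m
    implicit none
    procedure(hello), pointer :: p => null()
    type(c_funptr) :: cp
    print *, associated(p)        ! F: p starts disassociated
    cp = c_funloc(hello)          ! C address of an interoperable procedure
    call c_f_procpointer(cp, p)   ! handled by genCFProcPointer above
    print *, associated(p), associated(p, hello)
    call p()
  end program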
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 03b67104a93b..fc067ad35853 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@ add_flang_library(FIRTransforms
OMPFunctionFiltering.cpp
OMPMarkDeclareTarget.cpp
VScaleAttr.cpp
+ FunctionAttr.cpp
DEPENDS
FIRDialect
diff --git a/flang/lib/Optimizer/Transforms/FunctionAttr.cpp b/flang/lib/Optimizer/Transforms/FunctionAttr.cpp
new file mode 100644
index 000000000000..55b908ba5d86
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/FunctionAttr.cpp
@@ -0,0 +1,62 @@
+//===- FunctionAttr.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is a generic pass for adding attributes to functions.
+//===----------------------------------------------------------------------===//
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+
+namespace fir {
+#define GEN_PASS_DECL_FUNCTIONATTR
+#define GEN_PASS_DEF_FUNCTIONATTR
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "func-attr"
+
+namespace {
+
+class FunctionAttrPass : public fir::impl::FunctionAttrBase<FunctionAttrPass> {
+public:
+ FunctionAttrPass(const fir::FunctionAttrOptions &options) {
+ framePointerKind = options.framePointerKind;
+ }
+ FunctionAttrPass() {}
+ void runOnOperation() override;
+};
+
+} // namespace
+
+void FunctionAttrPass::runOnOperation() {
+ LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
+ mlir::func::FuncOp func = getOperation();
+
+ LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n");
+
+ mlir::MLIRContext *context = &getContext();
+ if (framePointerKind != mlir::LLVM::framePointerKind::FramePointerKind::None)
+ func->setAttr("frame_pointer", mlir::LLVM::FramePointerKindAttr::get(
+ context, framePointerKind));
+
+ LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
+}
+
+std::unique_ptr<mlir::Pass>
+fir::createFunctionAttrPass(fir::FunctionAttrTypes &functionAttr) {
+ FunctionAttrOptions opts;
+ // Frame pointer
+ opts.framePointerKind = functionAttr.framePointerKind;
+
+ return std::make_unique<FunctionAttrPass>(opts);
+}
+
+std::unique_ptr<mlir::Pass> fir::createFunctionAttrPass() {
+ return std::make_unique<FunctionAttrPass>();
+}
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index 12f354a47c2b..f5ddcbbaecd2 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -1220,6 +1220,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
llvm::raw_string_ostream nameOS(funcName);
outType.print(nameOS);
+ if (isDim)
+ nameOS << '_' << inputType;
nameOS << '_' << fmfString;
auto typeGenerator = [rank](fir::FirOpBuilder &builder) {
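
Appending the input type to the mangled name matters for the DIM= form because two MINLOC/MAXLOC reductions can produce the same result type while reducing arrays of different element types; without the extra component their specialized wrapper functions could collide. An illustrative pair (the subroutine itself is not part of the patch):

  subroutine locate(x, n, locx, locn)
    real,    intent(in)  :: x(:,:)
    integer, intent(in)  :: n(:,:)
    integer, intent(out) :: locx(size(x,2)), locn(size(n,2))
    locx = minloc(x, dim=1)   ! default-integer result, REAL input
    locn = minloc(n, dim=1)   ! default-integer result, INTEGER input
  end subroutine
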
diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
index b51e2aae1a9d..1c213abefe6f 100644
--- a/flang/lib/Optimizer/Transforms/StackArrays.cpp
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -449,7 +449,7 @@ StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) {
const LatticePoint *lattice = solver.lookupState<LatticePoint>(op);
// there will be no lattice for an unreachable block
if (lattice)
- point.join(*lattice);
+ (void)point.join(*lattice);
};
func->walk([&](mlir::func::ReturnOp child) { joinOperationLattice(child); });
func->walk([&](fir::UnreachableOp child) { joinOperationLattice(child); });
diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp
index c070bc1de373..0dd95d69d3c6 100644
--- a/flang/lib/Parser/Fortran-parsers.cpp
+++ b/flang/lib/Parser/Fortran-parsers.cpp
@@ -1151,8 +1151,9 @@ TYPE_PARSER(construct<PartRef>(name,
// R913 structure-component -> data-ref
// The final part-ref in the data-ref is not allowed to have subscripts.
-TYPE_PARSER(construct<StructureComponent>(
- construct<DataRef>(some(Parser<PartRef>{} / percentOrDot)), name))
+TYPE_CONTEXT_PARSER("component"_en_US,
+ construct<StructureComponent>(
+ construct<DataRef>(some(Parser<PartRef>{} / percentOrDot)), name))
// R919 subscript -> scalar-int-expr
constexpr auto subscript{scalarIntExpr};
diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp
index 892c612d0c4d..de2be017508c 100644
--- a/flang/lib/Parser/executable-parsers.cpp
+++ b/flang/lib/Parser/executable-parsers.cpp
@@ -92,9 +92,9 @@ TYPE_CONTEXT_PARSER("execution part"_en_US,
// close-stmt | continue-stmt | cycle-stmt | deallocate-stmt |
// endfile-stmt | error-stop-stmt | event-post-stmt | event-wait-stmt |
// exit-stmt | fail-image-stmt | flush-stmt | form-team-stmt |
-// goto-stmt | if-stmt | inquire-stmt | lock-stmt | nullify-stmt |
-// open-stmt | pointer-assignment-stmt | print-stmt | read-stmt |
-// return-stmt | rewind-stmt | stop-stmt | sync-all-stmt |
+// goto-stmt | if-stmt | inquire-stmt | lock-stmt | notify-wait-stmt |
+// nullify-stmt | open-stmt | pointer-assignment-stmt | print-stmt |
+// read-stmt | return-stmt | rewind-stmt | stop-stmt | sync-all-stmt |
// sync-images-stmt | sync-memory-stmt | sync-team-stmt | unlock-stmt |
// wait-stmt | where-stmt | write-stmt | computed-goto-stmt | forall-stmt
// R1159 continue-stmt -> CONTINUE
@@ -119,6 +119,7 @@ TYPE_PARSER(first(construct<ActionStmt>(indirect(Parser<AllocateStmt>{})),
construct<ActionStmt>(indirect(Parser<IfStmt>{})),
construct<ActionStmt>(indirect(Parser<InquireStmt>{})),
construct<ActionStmt>(indirect(Parser<LockStmt>{})),
+ construct<ActionStmt>(indirect(Parser<NotifyWaitStmt>{})),
construct<ActionStmt>(indirect(Parser<NullifyStmt>{})),
construct<ActionStmt>(indirect(Parser<OpenStmt>{})),
construct<ActionStmt>(indirect(Parser<PrintStmt>{})),
@@ -453,6 +454,13 @@ TYPE_CONTEXT_PARSER("STOP statement"_en_US,
// parse time.
TYPE_PARSER(construct<StopCode>(scalar(expr)))
+// F2023: R1166 notify-wait-stmt ->
+// NOTIFY WAIT ( notify-variable [, event-wait-spec-list] )
+TYPE_CONTEXT_PARSER("NOTIFY WAIT statement"_en_US,
+ construct<NotifyWaitStmt>(
+ "NOTIFY WAIT"_sptok >> "("_tok >> scalar(variable),
+ defaulted("," >> nonemptyList(Parser<EventWaitSpec>{})) / ")"))
+
// R1164 sync-all-stmt -> SYNC ALL [( [sync-stat-list] )]
TYPE_CONTEXT_PARSER("SYNC ALL statement"_en_US,
construct<SyncAllStmt>("SYNC ALL"_sptok >>
@@ -486,15 +494,14 @@ TYPE_CONTEXT_PARSER("EVENT POST statement"_en_US,
// EVENT WAIT ( event-variable [, event-wait-spec-list] )
TYPE_CONTEXT_PARSER("EVENT WAIT statement"_en_US,
construct<EventWaitStmt>("EVENT WAIT"_sptok >> "("_tok >> scalar(variable),
- defaulted("," >> nonemptyList(Parser<EventWaitStmt::EventWaitSpec>{})) /
- ")"))
+ defaulted("," >> nonemptyList(Parser<EventWaitSpec>{})) / ")"))
// R1174 until-spec -> UNTIL_COUNT = scalar-int-expr
constexpr auto untilSpec{"UNTIL_COUNT =" >> scalarIntExpr};
// R1173 event-wait-spec -> until-spec | sync-stat
-TYPE_PARSER(construct<EventWaitStmt::EventWaitSpec>(untilSpec) ||
- construct<EventWaitStmt::EventWaitSpec>(statOrErrmsg))
+TYPE_PARSER(construct<EventWaitSpec>(untilSpec) ||
+ construct<EventWaitSpec>(statOrErrmsg))
// R1177 team-variable -> scalar-variable
constexpr auto teamVariable{scalar(variable)};
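
For reference, the statement accepted by the new parser (Fortran 2023 R1166) names a scalar notify-variable and reuses UNTIL_COUNT= and the sync-stat specifiers in its optional spec-list. A sketch of conforming forms, exercising only the syntax (runtime behavior is out of scope here):

  program notify_demo
    use iso_fortran_env, only: notify_type
    implicit none
    type(notify_type) :: ready[*]
    integer :: stat
    character(128) :: msg

    notify wait (ready)
    notify wait (ready, until_count=2)
    notify wait (ready, until_count=2, stat=stat, errmsg=msg)
  end program
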
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index 79cdaccf1fbf..68d7d9f0c53c 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -139,16 +139,18 @@ void Prescanner::Statement() {
SkipSpaces();
}
} else {
- // Compiler directive. Emit normalized sentinel.
+ // Compiler directive. Emit normalized sentinel, squash following spaces.
EmitChar(tokens, '!');
++at_, ++column_;
for (const char *sp{directiveSentinel_}; *sp != '\0';
++sp, ++at_, ++column_) {
EmitChar(tokens, *sp);
}
- if (*at_ == ' ') {
+ if (*at_ == ' ' || *at_ == '\t') {
EmitChar(tokens, ' ');
- ++at_, ++column_;
+ while (*at_ == ' ' || *at_ == '\t') {
+ ++at_, ++column_;
+ }
}
tokens.CloseToken();
}
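
The net effect of the prescanner change is that a directive sentinel followed by a tab or by several blanks now tokenizes the same as one followed by a single blank. For example, with an OpenMP sentinel (the irregular spacing below is intentional; a tab in place of the blanks behaves the same way):

  subroutine init(a)
    real :: a(1000)
    integer :: i
    !$omp     parallel do
    do i = 1, 1000
      a(i) = 0.0
    end do
    !$omp end parallel do
  end subroutine
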
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 6d9d17621632..1df49a688a12 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -1150,6 +1150,11 @@ public:
void Unparse(const FailImageStmt &) { // R1163
Word("FAIL IMAGE");
}
+ void Unparse(const NotifyWaitStmt &x) { // F2023: R1166
+ Word("NOTIFY WAIT ("), Walk(std::get<Scalar<Variable>>(x.t));
+ Walk(", ", std::get<std::list<EventWaitSpec>>(x.t), ", ");
+ Put(')');
+ }
void Unparse(const SyncAllStmt &x) { // R1164
Word("SYNC ALL ("), Walk(x.v, ", "), Put(')');
}
@@ -1169,7 +1174,7 @@ public:
Word("EVENT POST ("), Walk(std::get<EventVariable>(x.t));
Walk(", ", std::get<std::list<StatOrErrmsg>>(x.t), ", "), Put(')');
}
- void Before(const EventWaitStmt::EventWaitSpec &x) { // R1173, R1174
+ void Before(const EventWaitSpec &x) { // R1173, R1174
common::visit(common::visitors{
[&](const ScalarIntExpr &) { Word("UNTIL_COUNT="); },
[](const StatOrErrmsg &) {},
@@ -1178,7 +1183,7 @@ public:
}
void Unparse(const EventWaitStmt &x) { // R1170
Word("EVENT WAIT ("), Walk(std::get<EventVariable>(x.t));
- Walk(", ", std::get<std::list<EventWaitStmt::EventWaitSpec>>(x.t), ", ");
+ Walk(", ", std::get<std::list<EventWaitSpec>>(x.t), ", ");
Put(')');
}
void Unparse(const FormTeamStmt &x) { // R1175, R1177
diff --git a/flang/lib/Semantics/check-coarray.cpp b/flang/lib/Semantics/check-coarray.cpp
index 77b198284e05..106af7960fa9 100644
--- a/flang/lib/Semantics/check-coarray.cpp
+++ b/flang/lib/Semantics/check-coarray.cpp
@@ -177,32 +177,15 @@ void CoarrayChecker::Leave(const parser::SyncTeamStmt &x) {
CheckSyncStatList(context_, std::get<std::list<parser::StatOrErrmsg>>(x.t));
}
-void CoarrayChecker::Leave(const parser::EventPostStmt &x) {
- CheckSyncStatList(context_, std::get<std::list<parser::StatOrErrmsg>>(x.t));
- CheckEventVariable(context_, std::get<parser::EventVariable>(x.t));
-}
-
-void CoarrayChecker::Leave(const parser::EventWaitStmt &x) {
- const auto &eventVar{std::get<parser::EventVariable>(x.t)};
-
- if (const auto *expr{GetExpr(context_, eventVar)}) {
- if (ExtractCoarrayRef(expr)) {
- context_.Say(parser::FindSourceLocation(eventVar), // C1177
- "A event-variable in a EVENT WAIT statement may not be a coindexed object"_err_en_US);
- } else {
- CheckEventVariable(context_, eventVar);
- }
- }
-
+static void CheckEventWaitSpecList(SemanticsContext &context,
+ const std::list<parser::EventWaitSpec> &eventWaitSpecList) {
bool gotStat{false}, gotMsg{false}, gotUntil{false};
- using EventWaitSpec = parser::EventWaitStmt::EventWaitSpec;
- for (const EventWaitSpec &eventWaitSpec :
- std::get<std::list<EventWaitSpec>>(x.t)) {
+ for (const parser::EventWaitSpec &eventWaitSpec : eventWaitSpecList) {
common::visit(
common::visitors{
[&](const parser::ScalarIntExpr &untilCount) {
if (gotUntil) {
- context_.Say( // C1178
+ context.Say( // C1178
"Until-spec in a event-wait-spec-list may not be repeated"_err_en_US);
}
gotUntil = true;
@@ -212,17 +195,17 @@ void CoarrayChecker::Leave(const parser::EventWaitStmt &x) {
common::visitors{
[&](const parser::StatVariable &stat) {
if (gotStat) {
- context_.Say( // C1178
+ context.Say( // C1178
"A stat-variable in a event-wait-spec-list may not be repeated"_err_en_US);
}
gotStat = true;
},
[&](const parser::MsgVariable &var) {
- WarnOnDeferredLengthCharacterScalar(context_,
- GetExpr(context_, var),
+ WarnOnDeferredLengthCharacterScalar(context,
+ GetExpr(context, var),
var.v.thing.thing.GetSource(), "ERRMSG=");
if (gotMsg) {
- context_.Say( // C1178
+ context.Say( // C1178
"A errmsg-variable in a event-wait-spec-list may not be repeated"_err_en_US);
}
gotMsg = true;
@@ -230,7 +213,7 @@ void CoarrayChecker::Leave(const parser::EventWaitStmt &x) {
},
statOrErrmsg.u);
CheckCoindexedStatOrErrmsg(
- context_, statOrErrmsg, "event-wait-spec-list");
+ context, statOrErrmsg, "event-wait-spec-list");
},
},
@@ -238,6 +221,48 @@ void CoarrayChecker::Leave(const parser::EventWaitStmt &x) {
}
}
+void CoarrayChecker::Leave(const parser::NotifyWaitStmt &x) {
+ const auto &notifyVar{std::get<parser::Scalar<parser::Variable>>(x.t)};
+
+ if (const auto *expr{GetExpr(context_, notifyVar)}) {
+ if (ExtractCoarrayRef(expr)) {
+ context_.Say(parser::FindSourceLocation(notifyVar), // F2023 - C1178
+ "A notify-variable in a NOTIFY WAIT statement may not be a coindexed object"_err_en_US);
+ } else if (!IsNotifyType(evaluate::GetDerivedTypeSpec(
+ expr->GetType()))) { // F2023 - C1177
+ context_.Say(parser::FindSourceLocation(notifyVar),
+ "The notify-variable must be of type NOTIFY_TYPE from module ISO_FORTRAN_ENV"_err_en_US);
+ } else if (!evaluate::IsCoarray(*expr)) { // F2023 - C1612
+ context_.Say(parser::FindSourceLocation(notifyVar),
+ "The notify-variable must be a coarray"_err_en_US);
+ }
+ }
+
+ CheckEventWaitSpecList(
+ context_, std::get<std::list<parser::EventWaitSpec>>(x.t));
+}
+
+void CoarrayChecker::Leave(const parser::EventPostStmt &x) {
+ CheckSyncStatList(context_, std::get<std::list<parser::StatOrErrmsg>>(x.t));
+ CheckEventVariable(context_, std::get<parser::EventVariable>(x.t));
+}
+
+void CoarrayChecker::Leave(const parser::EventWaitStmt &x) {
+ const auto &eventVar{std::get<parser::EventVariable>(x.t)};
+
+ if (const auto *expr{GetExpr(context_, eventVar)}) {
+ if (ExtractCoarrayRef(expr)) {
+ context_.Say(parser::FindSourceLocation(eventVar), // C1177
+ "A event-variable in a EVENT WAIT statement may not be a coindexed object"_err_en_US);
+ } else {
+ CheckEventVariable(context_, eventVar);
+ }
+ }
+
+ CheckEventWaitSpecList(
+ context_, std::get<std::list<parser::EventWaitSpec>>(x.t));
+}
+
void CoarrayChecker::Leave(const parser::UnlockStmt &x) {
CheckSyncStatList(context_, std::get<std::list<parser::StatOrErrmsg>>(x.t));
}
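
The new checks reject a coindexed notify-variable, a notify-variable that is not of type NOTIFY_TYPE, one that is not a coarray, and repeated specifiers in the event-wait-spec-list; the factored-out helper applies the same spec-list rules to EVENT WAIT. A sketch of what the branches diagnose (the commented-out statements are the non-conforming ones; names are illustrative):

  program notify_checks
    use iso_fortran_env, only: notify_type
    implicit none
    type(notify_type) :: ready[*]
    integer :: flag[*], stat

    notify wait (ready, stat=stat)                        ! conforming
    ! notify wait (ready[1])              ! coindexed notify-variable (F2023 C1178)
    ! notify wait (flag)                  ! not of type NOTIFY_TYPE (F2023 C1177)
    ! notify wait (ready, until_count=1, until_count=2)   ! repeated until-spec
  end program
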
diff --git a/flang/lib/Semantics/check-coarray.h b/flang/lib/Semantics/check-coarray.h
index 251ee980d8a5..0af9a880fd31 100644
--- a/flang/lib/Semantics/check-coarray.h
+++ b/flang/lib/Semantics/check-coarray.h
@@ -23,6 +23,7 @@ struct EventPostStmt;
struct EventWaitStmt;
struct FormTeamStmt;
struct ImageSelector;
+struct NotifyWaitStmt;
struct SyncAllStmt;
struct SyncImagesStmt;
struct SyncMemoryStmt;
@@ -41,6 +42,7 @@ public:
void Leave(const parser::SyncImagesStmt &);
void Leave(const parser::SyncMemoryStmt &);
void Leave(const parser::SyncTeamStmt &);
+ void Leave(const parser::NotifyWaitStmt &);
void Leave(const parser::EventPostStmt &);
void Leave(const parser::EventWaitStmt &);
void Leave(const parser::UnlockStmt &);
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index e1cd34ddf65b..64fc7de12087 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -698,7 +698,7 @@ protected:
bool CheckPossibleBadForwardRef(const Symbol &);
bool inSpecificationPart_{false};
- bool inDataStmtObject_{false};
+ bool deferImplicitTyping_{false};
bool inEquivalenceStmt_{false};
// Some information is collected from a specification part for deferred
@@ -1629,6 +1629,7 @@ private:
bool BeginScopeForNode(const ProgramTree &);
void EndScopeForNode(const ProgramTree &);
void FinishSpecificationParts(const ProgramTree &);
+ void FinishExecutionParts(const ProgramTree &);
void FinishDerivedTypeInstantiation(Scope &);
void ResolveExecutionParts(const ProgramTree &);
void UseCUDABuiltinNames();
@@ -2533,7 +2534,7 @@ void ScopeHandler::ApplyImplicitRules(
// or object, it'll be caught later.
return;
}
- if (inDataStmtObject_) {
+ if (deferImplicitTyping_) {
return;
}
if (!context().HasError(symbol)) {
@@ -2709,7 +2710,7 @@ const DeclTypeSpec &ScopeHandler::MakeLogicalType(int kind) {
}
void ScopeHandler::NotePossibleBadForwardRef(const parser::Name &name) {
- if (inSpecificationPart_ && !inDataStmtObject_ && name.symbol) {
+ if (inSpecificationPart_ && !deferImplicitTyping_ && name.symbol) {
auto kind{currScope().kind()};
if ((kind == Scope::Kind::Subprogram && !currScope().IsStmtFunction()) ||
kind == Scope::Kind::BlockConstruct) {
@@ -2904,7 +2905,7 @@ void ModuleVisitor::Post(const parser::UseStmt &x) {
}
for (const auto &[name, symbol] : *useModuleScope_) {
if (symbol->attrs().test(Attr::PUBLIC) && !IsUseRenamed(symbol->name()) &&
- (!symbol->attrs().test(Attr::INTRINSIC) ||
+ (!symbol->implicitAttrs().test(Attr::INTRINSIC) ||
symbol->has<UseDetails>()) &&
!symbol->has<MiscDetails>() && useNames.count(name) == 0) {
SourceName location{x.moduleName.source};
@@ -2998,7 +2999,7 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
details->add_occurrence(location, *useModuleScope_);
return;
}
-
+ const Symbol &useUltimate{useSymbol.GetUltimate()};
if (localSymbol.has<UnknownDetails>()) {
localSymbol.set_details(UseDetails{localName, useSymbol});
localSymbol.attrs() =
@@ -3010,7 +3011,6 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
}
Symbol &localUltimate{localSymbol.GetUltimate()};
- const Symbol &useUltimate{useSymbol.GetUltimate()};
if (&localUltimate == &useUltimate) {
// use-associating the same symbol again -- ok
return;
@@ -3044,13 +3044,19 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
checkAmbiguousDerivedType(&useUltimate, localGeneric->derivedType());
} else if (&useUltimate == &BypassGeneric(localUltimate).GetUltimate()) {
return; // nothing to do; used subprogram is local's specific
+ } else if (useUltimate.attrs().test(Attr::INTRINSIC) &&
+ useUltimate.name() == localSymbol.name()) {
+ return; // local generic can extend intrinsic
}
} else if (useGeneric) {
if (localUltimate.has<DerivedTypeDetails>()) {
combine =
checkAmbiguousDerivedType(&localUltimate, useGeneric->derivedType());
- } else if (&localUltimate == &BypassGeneric(useUltimate).GetUltimate()) {
- // Local is the specific of the used generic; replace it.
+ } else if (&localUltimate == &BypassGeneric(useUltimate).GetUltimate() ||
+ (localSymbol.attrs().test(Attr::INTRINSIC) &&
+ localUltimate.name() == useUltimate.name())) {
+ // Local is the specific of the used generic or an intrinsic with the
+ // same name; replace it.
EraseSymbol(localSymbol);
Symbol &newSymbol{MakeSymbol(localName,
useUltimate.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE},
@@ -3058,23 +3064,22 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
newSymbol.flags() = useSymbol.flags();
return;
}
+ } else if (localUltimate.name() != useUltimate.name()) {
+ // not the same procedure
+ } else if (localUltimate.attrs().test(Attr::INTRINSIC) &&
+ useUltimate.attrs().test(Attr::INTRINSIC)) {
+ return;
} else {
auto localClass{ClassifyProcedure(localUltimate)};
auto useClass{ClassifyProcedure(useUltimate)};
- if (localClass == useClass &&
- (localClass == ProcedureDefinitionClass::Intrinsic ||
- localClass == ProcedureDefinitionClass::External) &&
- localUltimate.name() == useUltimate.name()) {
+ if (localClass == ProcedureDefinitionClass::External &&
+ useClass == ProcedureDefinitionClass::External) {
auto localChars{evaluate::characteristics::Procedure::Characterize(
localUltimate, GetFoldingContext())};
auto useChars{evaluate::characteristics::Procedure::Characterize(
useUltimate, GetFoldingContext())};
- if (localChars && useChars) {
- if (*localChars == *useChars) {
- // Same intrinsic or external procedure defined identically in two
- // modules
- return;
- }
+ if (localChars && useChars && *localChars == *useChars) {
+ return; // same procedure defined identically in two modules
}
}
}
@@ -4794,9 +4799,15 @@ Symbol &DeclarationVisitor::HandleAttributeStmt(
}
}
} else if (symbol && symbol->has<UseDetails>()) {
- Say(currStmtSource().value(),
- "Cannot change %s attribute on use-associated '%s'"_err_en_US,
- EnumToString(attr), name.source);
+ if (symbol->GetUltimate().attrs().test(attr)) {
+ Say(currStmtSource().value(),
+ "Use-associated '%s' already has '%s' attribute"_warn_en_US,
+ name.source, EnumToString(attr));
+ } else {
+ Say(currStmtSource().value(),
+ "Cannot change %s attribute on use-associated '%s'"_err_en_US,
+ EnumToString(attr), name.source);
+ }
return *symbol;
}
if (!symbol) {
@@ -5929,9 +5940,9 @@ void DeclarationVisitor::Post(const parser::BasedPointer &bp) {
}
if (const auto *pointeeType{pointee->GetType()}) {
if (const auto *derived{pointeeType->AsDerived()}) {
- if (!derived->typeSymbol().get<DerivedTypeDetails>().sequence()) {
+ if (!IsSequenceOrBindCType(derived)) {
Say(pointeeName,
- "Type of Cray pointee '%s' is a non-sequence derived type"_err_en_US);
+ "Type of Cray pointee '%s' is a derived type that is neither SEQUENCE nor BIND(C)"_err_en_US);
}
}
}
@@ -6166,15 +6177,13 @@ void DeclarationVisitor::CheckCommonBlocks() {
Say(name,
"Unlimited polymorphic pointer '%s' may not appear in a COMMON block"_err_en_US);
} else if (const auto *derived{type->AsDerived()}) {
- auto &typeSymbol{derived->typeSymbol()};
- if (!typeSymbol.attrs().test(Attr::BIND_C) &&
- !typeSymbol.get<DerivedTypeDetails>().sequence()) {
+ if (!IsSequenceOrBindCType(derived)) {
Say(name,
"Derived type '%s' in COMMON block must have the BIND or"
" SEQUENCE attribute"_err_en_US);
}
UnorderedSymbolSet typeSet;
- CheckCommonBlockDerivedType(name, typeSymbol, typeSet);
+ CheckCommonBlockDerivedType(name, derived->typeSymbol(), typeSet);
}
}
}
@@ -6244,8 +6253,8 @@ bool DeclarationVisitor::HandleUnrestrictedSpecificIntrinsicFunction(
// recreated for it later on demand, but capturing its result type here
// will make GetType() return a correct result without having to
// probe the intrinsics table again.
- Symbol &symbol{
- MakeSymbol(InclusiveScope(), name.source, Attrs{Attr::INTRINSIC})};
+ Symbol &symbol{MakeSymbol(InclusiveScope(), name.source, Attrs{})};
+ SetImplicitAttr(symbol, Attr::INTRINSIC);
CHECK(interface->functionResult.has_value());
evaluate::DynamicType dyType{
DEREF(interface->functionResult->GetTypeAndShape()).type()};
@@ -6792,7 +6801,8 @@ bool ConstructVisitor::Pre(const parser::DataStmtObject &x) {
auto flagRestorer{common::ScopedSet(inSpecificationPart_, false)};
common::visit(common::visitors{
[&](const Indirection<parser::Variable> &y) {
- auto restorer{common::ScopedSet(inDataStmtObject_, true)};
+ auto restorer{
+ common::ScopedSet(deferImplicitTyping_, true)};
Walk(y.value());
const parser::Name &first{
parser::GetFirstName(y.value())};
@@ -7376,7 +7386,7 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) {
}
return &name;
}
- if (isImplicitNoneType() && !inDataStmtObject_) {
+ if (isImplicitNoneType() && !deferImplicitTyping_) {
Say(name, "No explicit type declared for '%s'"_err_en_US);
return nullptr;
}
@@ -7538,7 +7548,15 @@ void DeclarationVisitor::Initialization(const parser::Name &name,
common::visit(
common::visitors{
[&](const parser::ConstantExpr &expr) {
- NonPointerInitialization(name, expr);
+ Walk(expr);
+ if (IsNamedConstant(ultimate) || inComponentDecl) {
+ NonPointerInitialization(name, expr);
+ } else {
+ // Defer analysis so forward references to nested subprograms
+ // can be properly resolved when they appear in structure
+ // constructors.
+ ultimate.set(Symbol::Flag::InDataStmt);
+ }
},
[&](const parser::NullInit &null) { // => NULL()
Walk(null);
@@ -7559,10 +7577,12 @@ void DeclarationVisitor::Initialization(const parser::Name &name,
}
}
},
- [&](const parser::InitialDataTarget &) {
+ [&](const parser::InitialDataTarget &target) {
// Defer analysis to the end of the specification part
// so that forward references and attribute checks like SAVE
// work better.
+ auto restorer{common::ScopedSet(deferImplicitTyping_, true)};
+ Walk(target);
ultimate.set(Symbol::Flag::InDataStmt);
},
[&](const std::list<Indirection<parser::DataStmtValue>> &values) {
@@ -7580,12 +7600,27 @@ void DeclarationVisitor::PointerInitialization(
Symbol &ultimate{name.symbol->GetUltimate()};
if (!context().HasError(ultimate)) {
if (IsPointer(ultimate)) {
- if (auto *details{ultimate.detailsIf<ObjectEntityDetails>()}) {
- CHECK(!details->init());
- Walk(target);
- if (MaybeExpr expr{EvaluateExpr(target)}) {
- // Validation is done in declaration checking.
+ Walk(target);
+ if (MaybeExpr expr{EvaluateExpr(target)}) {
+ // Validation is done in declaration checking.
+ if (auto *details{ultimate.detailsIf<ObjectEntityDetails>()}) {
+ CHECK(!details->init());
details->set_init(std::move(*expr));
+ ultimate.set(Symbol::Flag::InDataStmt, false);
+ } else if (auto *details{ultimate.detailsIf<ProcEntityDetails>()}) {
+ // something like "REAL, EXTERNAL, POINTER :: p => t"
+ if (evaluate::IsNullProcedurePointer(*expr)) {
+ CHECK(!details->init());
+ details->set_init(nullptr);
+ } else if (const Symbol *
+ targetSymbol{evaluate::UnwrapWholeSymbolDataRef(*expr)}) {
+ CHECK(!details->init());
+ details->set_init(*targetSymbol);
+ } else {
+ Say(name,
+ "Procedure pointer '%s' must be initialized with a procedure name or NULL()"_err_en_US);
+ context().SetError(ultimate);
+ }
}
}
} else {
@@ -7625,27 +7660,23 @@ void DeclarationVisitor::PointerInitialization(
void DeclarationVisitor::NonPointerInitialization(
const parser::Name &name, const parser::ConstantExpr &expr) {
- if (name.symbol) {
+ if (!context().HasError(name.symbol)) {
Symbol &ultimate{name.symbol->GetUltimate()};
- if (!context().HasError(ultimate) && !context().HasError(name.symbol)) {
+ if (!context().HasError(ultimate)) {
if (IsPointer(ultimate)) {
Say(name,
"'%s' is a pointer but is not initialized like one"_err_en_US);
} else if (auto *details{ultimate.detailsIf<ObjectEntityDetails>()}) {
- CHECK(!details->init());
- if (IsAllocatable(ultimate)) {
+ if (details->init()) {
+ } else if (IsAllocatable(ultimate)) {
Say(name, "Allocatable object '%s' cannot be initialized"_err_en_US);
- return;
- }
- Walk(expr);
- if (ultimate.owner().IsParameterizedDerivedType()) {
+ } else if (ultimate.owner().IsParameterizedDerivedType()) {
// Save the expression for per-instantiation analysis.
details->set_unanalyzedPDTComponentInit(&expr.thing.value());
- } else {
- if (MaybeExpr folded{EvaluateNonPointerInitializer(
- ultimate, expr, expr.thing.value().source)}) {
- details->set_init(std::move(*folded));
- }
+ } else if (MaybeExpr folded{EvaluateNonPointerInitializer(
+ ultimate, expr, expr.thing.value().source)}) {
+ details->set_init(std::move(*folded));
+ ultimate.set(Symbol::Flag::InDataStmt, false);
}
} else {
Say(name, "'%s' is not an object that can be initialized"_err_en_US);
@@ -7708,8 +7739,8 @@ void ResolveNamesVisitor::HandleProcedureName(
auto *symbol{FindSymbol(NonDerivedTypeScope(), name)};
if (!symbol) {
if (IsIntrinsic(name.source, flag)) {
- symbol =
- &MakeSymbol(InclusiveScope(), name.source, Attrs{Attr::INTRINSIC});
+ symbol = &MakeSymbol(InclusiveScope(), name.source, Attrs{});
+ SetImplicitAttr(*symbol, Attr::INTRINSIC);
} else if (const auto ppcBuiltinScope =
currScope().context().GetPPCBuiltinsScope()) {
// Check if it is a builtin from the predefined module
@@ -8047,6 +8078,11 @@ void ResolveNamesVisitor::CreateGeneric(const parser::GenericSpec &x) {
} else if (ultimate.has<SubprogramDetails>() ||
ultimate.has<SubprogramNameDetails>()) {
genericDetails.set_specific(*existing);
+ } else if (ultimate.has<ProcEntityDetails>()) {
+ if (existing->name() != symbolName ||
+ !ultimate.attrs().test(Attr::INTRINSIC)) {
+ genericDetails.set_specific(*existing);
+ }
} else if (ultimate.has<DerivedTypeDetails>()) {
genericDetails.set_derivedType(*existing);
} else if (&existing->owner() == &currScope()) {
@@ -8409,6 +8445,7 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) {
ResolveSpecificationParts(root);
FinishSpecificationParts(root);
ResolveExecutionParts(root);
+ FinishExecutionParts(root);
ResolveAccParts(context(), x);
ResolveOmpParts(context(), x);
return false;
@@ -8826,6 +8863,8 @@ public:
}
}
+ bool Pre(const parser::BlockConstruct &x) { return true; }
+
void Post(const parser::ProcInterface &pi) {
if (const auto *name{std::get_if<parser::Name>(&pi.u)}) {
resolver_.CheckExplicitInterface(*name);
@@ -8856,7 +8895,6 @@ public:
resolver_.CheckBindings(tbps);
}
}
- bool Pre(const parser::StmtFunctionStmt &stmtFunc) { return false; }
private:
void Init(const parser::Name &name,
@@ -8865,6 +8903,9 @@ private:
if (const auto *target{
std::get_if<parser::InitialDataTarget>(&init->u)}) {
resolver_.PointerInitialization(name, *target);
+ } else if (const auto *expr{
+ std::get_if<parser::ConstantExpr>(&init->u)}) {
+ resolver_.NonPointerInitialization(name, *expr);
}
}
}
@@ -8879,15 +8920,16 @@ void ResolveNamesVisitor::FinishSpecificationParts(const ProgramTree &node) {
if (!node.scope()) {
return; // error occurred creating scope
}
+ auto flagRestorer{common::ScopedSet(inSpecificationPart_, true)};
SetScope(*node.scope());
- // The initializers of pointers, the default initializers of pointer
- // components, non-deferred type-bound procedure bindings have not
- // yet been traversed.
- // We do that now, when any (formerly) forward references that appear
+ // The initializers of pointers and non-PARAMETER objects, the default
+ // initializers of components, and non-deferred type-bound procedure
+ // bindings have not yet been traversed.
+ // We do that now, when any forward references that appeared
// in those initializers will resolve to the right symbols without
- // incurring spurious errors with IMPLICIT NONE.
+ // incurring spurious errors with IMPLICIT NONE or forward references
+ // to nested subprograms.
DeferredCheckVisitor{*this}.Walk(node.spec());
- DeferredCheckVisitor{*this}.Walk(node.exec()); // for BLOCK
for (Scope &childScope : currScope().children()) {
if (childScope.IsParameterizedDerivedTypeInstantiation()) {
FinishDerivedTypeInstantiation(childScope);
@@ -8898,6 +8940,18 @@ void ResolveNamesVisitor::FinishSpecificationParts(const ProgramTree &node) {
}
}
+void ResolveNamesVisitor::FinishExecutionParts(const ProgramTree &node) {
+ if (node.scope()) {
+ SetScope(*node.scope());
+ if (node.exec()) {
+ DeferredCheckVisitor{*this}.Walk(*node.exec());
+ }
+ for (const auto &child : node.children()) {
+ FinishExecutionParts(child);
+ }
+ }
+}
+
// Duplicate and fold component object pointer default initializer designators
// using the actual type parameter values of each particular instantiation.
// Validation is done later in declaration checking.
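
Several of the name-resolution changes above target one user-visible pattern: a name that is an explicitly declared intrinsic in one module and a generic that extends that intrinsic in another, with both reaching a scope by use association (relatedly, a name whose INTRINSIC attribute is only implicit is no longer pulled in by an unqualified USE). A sketch of the pattern these changes accept; module and procedure names are illustrative:

  module m1
    intrinsic :: abs                 ! explicitly declared, so ABS is exported
  end module

  module m2
    interface abs                    ! generic ABS extending the intrinsic
      module procedure abs_pair
    end interface
  contains
    real function abs_pair(a, b)
      real, intent(in) :: a, b
      abs_pair = max(abs(a), abs(b))
    end function
  end module

  program demo
    use m1
    use m2        ! the generic extends, rather than clashes with, the intrinsic ABS
    print *, abs(-2.0), abs(-1.0, 3.0)
  end program
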
diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90
index 0bc66def847e..0566ae6327d7 100644
--- a/flang/module/__fortran_builtins.f90
+++ b/flang/module/__fortran_builtins.f90
@@ -32,6 +32,10 @@ module __fortran_builtins
integer(kind=int64), private :: __count
end type
+ type :: __builtin_notify_type
+ integer(kind=int64), private :: __count
+ end type
+
type :: __builtin_lock_type
integer(kind=int64), private :: __count
end type
diff --git a/flang/module/iso_fortran_env.f90 b/flang/module/iso_fortran_env.f90
index f1d540bc8e45..cd3c06f8c756 100644
--- a/flang/module/iso_fortran_env.f90
+++ b/flang/module/iso_fortran_env.f90
@@ -9,12 +9,13 @@
! See Fortran 2018, clause 16.10.2
! TODO: These are placeholder values so that some tests can be run.
-include '../include/flang/Runtime/magic-numbers.h' ! IOSTAT values
+include '../include/flang/Runtime/magic-numbers.h'
module iso_fortran_env
use __fortran_builtins, only: &
event_type => __builtin_event_type, &
+ notify_type => __builtin_notify_type, &
lock_type => __builtin_lock_type, &
team_type => __builtin_team_type, &
atomic_int_kind => __builtin_atomic_int_kind, &
@@ -23,6 +24,7 @@ module iso_fortran_env
compiler_version => __builtin_compiler_version
implicit none
+ private count
! TODO: Use PACK([x],test) in place of the array constructor idiom
! [(x, integer::j=1,COUNT([test]))] below once PACK() can be folded.
@@ -129,8 +131,9 @@ module iso_fortran_env
integer, parameter :: current_team = -1, initial_team = -2, parent_team = -3
- integer, parameter :: input_unit = 5, output_unit = 6
- integer, parameter :: error_unit = 0
+ integer, parameter :: output_unit = FORTRAN_DEFAULT_OUTPUT_UNIT
+ integer, parameter :: input_unit = FORTRAN_DEFAULT_INPUT_UNIT
+ integer, parameter :: error_unit = FORTRAN_ERROR_UNIT
integer, parameter :: iostat_end = FORTRAN_RUNTIME_IOSTAT_END
integer, parameter :: iostat_eor = FORTRAN_RUNTIME_IOSTAT_EOR
integer, parameter :: iostat_inquire_internal_unit = &
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index bf3aa5af3c88..d6df15b7f6e0 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -90,16 +90,16 @@ set(sources
array-constructor.cpp
assign.cpp
buffer.cpp
+ character.cpp
command.cpp
complex-powi.cpp
complex-reduction.c
- copy.cpp
- character.cpp
connection.cpp
- derived.cpp
+ copy.cpp
derived-api.cpp
- descriptor.cpp
+ derived.cpp
descriptor-io.cpp
+ descriptor.cpp
dot-product.cpp
edit-input.cpp
edit-output.cpp
@@ -112,10 +112,10 @@ set(sources
format.cpp
inquiry.cpp
internal-unit.cpp
- iostat.cpp
io-api.cpp
io-error.cpp
io-stmt.cpp
+ iostat.cpp
main.cpp
matmul-transpose.cpp
matmul.cpp
@@ -124,11 +124,11 @@ set(sources
namelist.cpp
non-tbp-dio.cpp
numeric.cpp
+ pointer.cpp
+ product.cpp
ragged.cpp
random.cpp
reduction.cpp
- pointer.cpp
- product.cpp
stat.cpp
stop.cpp
sum.cpp
@@ -140,8 +140,8 @@ set(sources
transformational.cpp
type-code.cpp
type-info.cpp
- unit.cpp
unit-map.cpp
+ unit.cpp
utf.cpp
)
@@ -152,10 +152,29 @@ option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
# List of files that are buildable for all devices.
set(supported_files
ISO_Fortran_binding.cpp
+ allocatable.cpp
+ array-constructor.cpp
assign.cpp
+ character.cpp
+ copy.cpp
+ derived-api.cpp
derived.cpp
descriptor.cpp
+ dot-product.cpp
+ extrema.cpp
+ findloc.cpp
+ inquiry.cpp
+ matmul-transpose.cpp
+ matmul.cpp
+ memory.cpp
+ misc-intrinsic.cpp
+ numeric.cpp
+ pointer.cpp
+ product.cpp
+ ragged.cpp
stat.cpp
+ sum.cpp
+ support.cpp
terminator.cpp
tools.cpp
transformational.cpp
@@ -188,6 +207,10 @@ if (FLANG_EXPERIMENTAL_CUDA_RUNTIME)
if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "NVIDIA")
set(CUDA_COMPILE_OPTIONS
--expt-relaxed-constexpr
+ # Disable these warnings:
+ # 'long double' is treated as 'double' in device code
+ -Xcudafe --diag_suppress=20208
+ -Xcudafe --display_error_number
)
endif()
set_source_files_properties(${supported_files} PROPERTIES COMPILE_OPTIONS
diff --git a/flang/runtime/allocatable.cpp b/flang/runtime/allocatable.cpp
index 409255aaa214..e69795e6f824 100644
--- a/flang/runtime/allocatable.cpp
+++ b/flang/runtime/allocatable.cpp
@@ -18,8 +18,9 @@
namespace Fortran::runtime {
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-void RTNAME(AllocatableInitIntrinsic)(Descriptor &descriptor,
+void RTDEF(AllocatableInitIntrinsic)(Descriptor &descriptor,
TypeCategory category, int kind, int rank, int corank) {
INTERNAL_CHECK(corank == 0);
descriptor.Establish(TypeCode{category, kind},
@@ -27,21 +28,21 @@ void RTNAME(AllocatableInitIntrinsic)(Descriptor &descriptor,
CFI_attribute_allocatable);
}
-void RTNAME(AllocatableInitCharacter)(Descriptor &descriptor,
+void RTDEF(AllocatableInitCharacter)(Descriptor &descriptor,
SubscriptValue length, int kind, int rank, int corank) {
INTERNAL_CHECK(corank == 0);
descriptor.Establish(
kind, length, nullptr, rank, nullptr, CFI_attribute_allocatable);
}
-void RTNAME(AllocatableInitDerived)(Descriptor &descriptor,
+void RTDEF(AllocatableInitDerived)(Descriptor &descriptor,
const typeInfo::DerivedType &derivedType, int rank, int corank) {
INTERNAL_CHECK(corank == 0);
descriptor.Establish(
derivedType, nullptr, rank, nullptr, CFI_attribute_allocatable);
}
-void RTNAME(AllocatableInitIntrinsicForAllocate)(Descriptor &descriptor,
+void RTDEF(AllocatableInitIntrinsicForAllocate)(Descriptor &descriptor,
TypeCategory category, int kind, int rank, int corank) {
if (descriptor.IsAllocated()) {
return;
@@ -49,7 +50,7 @@ void RTNAME(AllocatableInitIntrinsicForAllocate)(Descriptor &descriptor,
RTNAME(AllocatableInitIntrinsic)(descriptor, category, kind, rank, corank);
}
-void RTNAME(AllocatableInitCharacterForAllocate)(Descriptor &descriptor,
+void RTDEF(AllocatableInitCharacterForAllocate)(Descriptor &descriptor,
SubscriptValue length, int kind, int rank, int corank) {
if (descriptor.IsAllocated()) {
return;
@@ -57,7 +58,7 @@ void RTNAME(AllocatableInitCharacterForAllocate)(Descriptor &descriptor,
RTNAME(AllocatableInitCharacter)(descriptor, length, kind, rank, corank);
}
-void RTNAME(AllocatableInitDerivedForAllocate)(Descriptor &descriptor,
+void RTDEF(AllocatableInitDerivedForAllocate)(Descriptor &descriptor,
const typeInfo::DerivedType &derivedType, int rank, int corank) {
if (descriptor.IsAllocated()) {
return;
@@ -65,7 +66,7 @@ void RTNAME(AllocatableInitDerivedForAllocate)(Descriptor &descriptor,
RTNAME(AllocatableInitDerived)(descriptor, derivedType, rank, corank);
}
-std::int32_t RTNAME(MoveAlloc)(Descriptor &to, Descriptor &from,
+std::int32_t RTDEF(MoveAlloc)(Descriptor &to, Descriptor &from,
const typeInfo::DerivedType *derivedType, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
@@ -110,21 +111,21 @@ std::int32_t RTNAME(MoveAlloc)(Descriptor &to, Descriptor &from,
return StatOk;
}
-void RTNAME(AllocatableSetBounds)(Descriptor &descriptor, int zeroBasedDim,
+void RTDEF(AllocatableSetBounds)(Descriptor &descriptor, int zeroBasedDim,
SubscriptValue lower, SubscriptValue upper) {
INTERNAL_CHECK(zeroBasedDim >= 0 && zeroBasedDim < descriptor.rank());
descriptor.GetDimension(zeroBasedDim).SetBounds(lower, upper);
// The byte strides are computed when the object is allocated.
}
-void RTNAME(AllocatableSetDerivedLength)(
+void RTDEF(AllocatableSetDerivedLength)(
Descriptor &descriptor, int which, SubscriptValue x) {
DescriptorAddendum *addendum{descriptor.Addendum()};
INTERNAL_CHECK(addendum != nullptr);
addendum->SetLenParameterValue(which, x);
}
-void RTNAME(AllocatableApplyMold)(
+void RTDEF(AllocatableApplyMold)(
Descriptor &descriptor, const Descriptor &mold, int rank) {
if (descriptor.IsAllocated()) {
// 9.7.1.3 Return so the error can be emitted by AllocatableAllocate.
@@ -133,7 +134,7 @@ void RTNAME(AllocatableApplyMold)(
descriptor.ApplyMold(mold, rank);
}
-int RTNAME(AllocatableAllocate)(Descriptor &descriptor, bool hasStat,
+int RTDEF(AllocatableAllocate)(Descriptor &descriptor, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (!descriptor.IsAllocatable()) {
@@ -155,7 +156,7 @@ int RTNAME(AllocatableAllocate)(Descriptor &descriptor, bool hasStat,
return stat;
}
-int RTNAME(AllocatableAllocateSource)(Descriptor &alloc,
+int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
@@ -167,7 +168,7 @@ int RTNAME(AllocatableAllocateSource)(Descriptor &alloc,
return stat;
}
-int RTNAME(AllocatableDeallocate)(Descriptor &descriptor, bool hasStat,
+int RTDEF(AllocatableDeallocate)(Descriptor &descriptor, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (!descriptor.IsAllocatable()) {
@@ -182,7 +183,7 @@ int RTNAME(AllocatableDeallocate)(Descriptor &descriptor, bool hasStat,
errMsg, hasStat);
}
-int RTNAME(AllocatableDeallocatePolymorphic)(Descriptor &descriptor,
+int RTDEF(AllocatableDeallocatePolymorphic)(Descriptor &descriptor,
const typeInfo::DerivedType *derivedType, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableDeallocate)(
@@ -202,7 +203,7 @@ int RTNAME(AllocatableDeallocatePolymorphic)(Descriptor &descriptor,
return stat;
}
-void RTNAME(AllocatableDeallocateNoFinal)(
+void RTDEF(AllocatableDeallocateNoFinal)(
Descriptor &descriptor, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (!descriptor.IsAllocatable()) {
@@ -217,5 +218,7 @@ void RTNAME(AllocatableDeallocateNoFinal)(
}
// TODO: AllocatableCheckLengthParameter
+
+RT_EXT_API_GROUP_END
}
} // namespace Fortran::runtime
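
The entry points switched to RTDEF above implement ordinary allocatable operations, and wrapping them in the RT_EXT_API_GROUP markers lets the same definitions be built for offload targets. For orientation, a fragment whose ALLOCATE, MOVE_ALLOC, and DEALLOCATE statements are typically lowered to these entry points (the mapping noted in the comments is the usual lowering, sketched here for illustration, not something introduced by this patch):

  subroutine rebuild(a, n)
    real, allocatable, intent(inout) :: a(:)
    integer, intent(in) :: n
    real, allocatable :: tmp(:)
    integer :: stat

    allocate (tmp(n), source=0.0, stat=stat)  ! AllocatableAllocateSource / AllocatableAllocate
    if (stat /= 0) return
    call move_alloc(tmp, a)                   ! MoveAlloc
    ! a later DEALLOCATE (a) would go through AllocatableDeallocate
  end subroutine
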
diff --git a/flang/runtime/array-constructor.cpp b/flang/runtime/array-constructor.cpp
index 1be302eaaf1a..72e08feff7fd 100644
--- a/flang/runtime/array-constructor.cpp
+++ b/flang/runtime/array-constructor.cpp
@@ -9,6 +9,7 @@
#include "flang/Runtime/array-constructor.h"
#include "derived.h"
#include "terminator.h"
+#include "tools.h"
#include "type-info.h"
#include "flang/Runtime/allocatable.h"
#include "flang/Runtime/assign.h"
@@ -23,7 +24,7 @@ namespace Fortran::runtime {
// REAL(8), INTEGER(8), COMPLEX(4), ... -> 16 elements.
// REAL(16), INTEGER(16), COMPLEX(8), ... -> 8 elements.
// Bigger types -> 4 elements.
-static SubscriptValue initialAllocationSize(
+static RT_API_ATTRS SubscriptValue initialAllocationSize(
SubscriptValue initialNumberOfElements, SubscriptValue elementBytes) {
// Try to guess an optimal initial allocation size in number of elements to
   // avoid doing too many reallocations.
@@ -36,9 +37,9 @@ static SubscriptValue initialAllocationSize(
return std::max(numberOfElements, elementsForMinBytes);
}
-static void AllocateOrReallocateVectorIfNeeded(ArrayConstructorVector &vector,
- Terminator &terminator, SubscriptValue previousToElements,
- SubscriptValue fromElements) {
+static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
+ ArrayConstructorVector &vector, Terminator &terminator,
+ SubscriptValue previousToElements, SubscriptValue fromElements) {
Descriptor &to{vector.to};
if (to.IsAllocatable() && !to.IsAllocated()) {
// The descriptor bounds may already be set here if the array constructor
@@ -73,8 +74,8 @@ static void AllocateOrReallocateVectorIfNeeded(ArrayConstructorVector &vector,
// realloc is undefined with zero new size and ElementBytes() may be null
// if the character length is null, or if "from" is a zero sized array.
if (newByteSize > 0) {
- void *p{std::realloc(to.raw().base_addr, newByteSize)};
- RUNTIME_CHECK(terminator, p);
+ void *p{ReallocateMemoryOrCrash(
+ terminator, to.raw().base_addr, newByteSize)};
to.set_base_addr(p);
}
vector.actualAllocationSize = requestedAllocationSize;
@@ -88,7 +89,9 @@ static void AllocateOrReallocateVectorIfNeeded(ArrayConstructorVector &vector,
}
extern "C" {
-void RTNAME(InitArrayConstructorVector)(ArrayConstructorVector &vector,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(InitArrayConstructorVector)(ArrayConstructorVector &vector,
Descriptor &to, bool useValueLengthParameters, int vectorClassSize,
const char *sourceFile, int sourceLine) {
Terminator terminator{vector.sourceFile, vector.sourceLine};
@@ -102,7 +105,7 @@ void RTNAME(InitArrayConstructorVector)(ArrayConstructorVector &vector,
actualAllocationSize, sourceFile, sourceLine, useValueLengthParameters};
}
-void RTNAME(PushArrayConstructorValue)(
+void RTDEF(PushArrayConstructorValue)(
ArrayConstructorVector &vector, const Descriptor &from) {
Terminator terminator{vector.sourceFile, vector.sourceLine};
Descriptor &to{vector.to};
@@ -166,7 +169,7 @@ void RTNAME(PushArrayConstructorValue)(
vector.nextValuePosition += fromElements;
}
-void RTNAME(PushArrayConstructorSimpleScalar)(
+void RTDEF(PushArrayConstructorSimpleScalar)(
ArrayConstructorVector &vector, void *from) {
Terminator terminator{vector.sourceFile, vector.sourceLine};
Descriptor &to{vector.to};
@@ -176,5 +179,7 @@ void RTNAME(PushArrayConstructorSimpleScalar)(
std::memcpy(to.Element<char>(subscript), from, to.ElementBytes());
++vector.nextValuePosition;
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/character.cpp b/flang/runtime/character.cpp
index 2afde7cd5e83..5049247397eb 100644
--- a/flang/runtime/character.cpp
+++ b/flang/runtime/character.cpp
@@ -11,6 +11,7 @@
#include "tools.h"
#include "flang/Common/bit-population-count.h"
#include "flang/Common/uint128.h"
+#include "flang/Runtime/character.h"
#include "flang/Runtime/cpp-type.h"
#include "flang/Runtime/descriptor.h"
#include <algorithm>
@@ -19,7 +20,8 @@
namespace Fortran::runtime {
template <typename CHAR>
-inline int CompareToBlankPadding(const CHAR *x, std::size_t chars) {
+inline RT_API_ATTRS int CompareToBlankPadding(
+ const CHAR *x, std::size_t chars) {
using UNSIGNED_CHAR = std::make_unsigned_t<CHAR>;
const auto blank{static_cast<UNSIGNED_CHAR>(' ')};
for (; chars-- > 0; ++x) {
@@ -34,13 +36,15 @@ inline int CompareToBlankPadding(const CHAR *x, std::size_t chars) {
return 0;
}
+RT_OFFLOAD_API_GROUP_BEGIN
+
template <typename CHAR>
-int CharacterScalarCompare(
+RT_API_ATTRS int CharacterScalarCompare(
const CHAR *x, const CHAR *y, std::size_t xChars, std::size_t yChars) {
auto minChars{std::min(xChars, yChars)};
if constexpr (sizeof(CHAR) == 1) {
// don't use for kind=2 or =4, that would fail on little-endian machines
- int cmp{std::memcmp(x, y, minChars)};
+ int cmp{Fortran::runtime::memcmp(x, y, minChars)};
if (cmp < 0) {
return -1;
}
@@ -68,20 +72,22 @@ int CharacterScalarCompare(
return -CompareToBlankPadding(y, yChars - minChars);
}
-template int CharacterScalarCompare<char>(
+template RT_API_ATTRS int CharacterScalarCompare<char>(
const char *x, const char *y, std::size_t xChars, std::size_t yChars);
-template int CharacterScalarCompare<char16_t>(const char16_t *x,
+template RT_API_ATTRS int CharacterScalarCompare<char16_t>(const char16_t *x,
const char16_t *y, std::size_t xChars, std::size_t yChars);
-template int CharacterScalarCompare<char32_t>(const char32_t *x,
+template RT_API_ATTRS int CharacterScalarCompare<char32_t>(const char32_t *x,
const char32_t *y, std::size_t xChars, std::size_t yChars);
+RT_OFFLOAD_API_GROUP_END
+
// Shift count to use when converting between character lengths
// and byte counts.
template <typename CHAR>
constexpr int shift{common::TrailingZeroBitCount(sizeof(CHAR))};
template <typename CHAR>
-static void Compare(Descriptor &result, const Descriptor &x,
+static RT_API_ATTRS void Compare(Descriptor &result, const Descriptor &x,
const Descriptor &y, const Terminator &terminator) {
RUNTIME_CHECK(
terminator, x.rank() == y.rank() || x.rank() == 0 || y.rank() == 0);
@@ -124,7 +130,7 @@ static void Compare(Descriptor &result, const Descriptor &x,
}
template <typename CHAR, bool ADJUSTR>
-static void Adjust(CHAR *to, const CHAR *from, std::size_t chars) {
+static RT_API_ATTRS void Adjust(CHAR *to, const CHAR *from, std::size_t chars) {
if constexpr (ADJUSTR) {
std::size_t j{chars}, k{chars};
for (; k > 0 && from[k - 1] == ' '; --k) {
@@ -149,8 +155,8 @@ static void Adjust(CHAR *to, const CHAR *from, std::size_t chars) {
}
template <typename CHAR, bool ADJUSTR>
-static void AdjustLRHelper(Descriptor &result, const Descriptor &string,
- const Terminator &terminator) {
+static RT_API_ATTRS void AdjustLRHelper(Descriptor &result,
+ const Descriptor &string, const Terminator &terminator) {
int rank{string.rank()};
SubscriptValue ub[maxRank], stringAt[maxRank];
SubscriptValue elements{1};
@@ -177,7 +183,7 @@ static void AdjustLRHelper(Descriptor &result, const Descriptor &string,
}
template <bool ADJUSTR>
-void AdjustLR(Descriptor &result, const Descriptor &string,
+RT_API_ATTRS void AdjustLR(Descriptor &result, const Descriptor &string,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
switch (string.raw().type) {
@@ -197,7 +203,7 @@ void AdjustLR(Descriptor &result, const Descriptor &string,
}
template <typename CHAR>
-inline std::size_t LenTrim(const CHAR *x, std::size_t chars) {
+inline RT_API_ATTRS std::size_t LenTrim(const CHAR *x, std::size_t chars) {
while (chars > 0 && x[chars - 1] == ' ') {
--chars;
}
@@ -205,7 +211,7 @@ inline std::size_t LenTrim(const CHAR *x, std::size_t chars) {
}
template <typename INT, typename CHAR>
-static void LenTrim(Descriptor &result, const Descriptor &string,
+static RT_API_ATTRS void LenTrim(Descriptor &result, const Descriptor &string,
const Terminator &terminator) {
int rank{string.rank()};
SubscriptValue ub[maxRank], stringAt[maxRank];
@@ -232,8 +238,8 @@ static void LenTrim(Descriptor &result, const Descriptor &string,
}
template <typename CHAR>
-static void LenTrimKind(Descriptor &result, const Descriptor &string, int kind,
- const Terminator &terminator) {
+static RT_API_ATTRS void LenTrimKind(Descriptor &result,
+ const Descriptor &string, int kind, const Terminator &terminator) {
switch (kind) {
case 1:
LenTrim<CppTypeFor<TypeCategory::Integer, 1>, CHAR>(
@@ -263,8 +269,8 @@ static void LenTrimKind(Descriptor &result, const Descriptor &string, int kind,
// INDEX implementation
template <typename CHAR>
-inline std::size_t Index(const CHAR *x, std::size_t xLen, const CHAR *want,
- std::size_t wantLen, bool back) {
+inline RT_API_ATTRS std::size_t Index(const CHAR *x, std::size_t xLen,
+ const CHAR *want, std::size_t wantLen, bool back) {
if (xLen < wantLen) {
return 0;
}
@@ -329,8 +335,8 @@ inline std::size_t Index(const CHAR *x, std::size_t xLen, const CHAR *want,
enum class CharFunc { Index, Scan, Verify };
template <typename CHAR, CharFunc FUNC>
-inline std::size_t ScanVerify(const CHAR *x, std::size_t xLen, const CHAR *set,
- std::size_t setLen, bool back) {
+inline RT_API_ATTRS std::size_t ScanVerify(const CHAR *x, std::size_t xLen,
+ const CHAR *set, std::size_t setLen, bool back) {
std::size_t at{back ? xLen : 1};
int increment{back ? -1 : 1};
for (; xLen-- > 0; at += increment) {
@@ -352,8 +358,8 @@ inline std::size_t ScanVerify(const CHAR *x, std::size_t xLen, const CHAR *set,
// Specialization for one-byte characters
template <bool IS_VERIFY = false>
-inline std::size_t ScanVerify(const char *x, std::size_t xLen, const char *set,
- std::size_t setLen, bool back) {
+inline RT_API_ATTRS std::size_t ScanVerify(const char *x, std::size_t xLen,
+ const char *set, std::size_t setLen, bool back) {
std::size_t at{back ? xLen : 1};
int increment{back ? -1 : 1};
if (xLen > 0) {
@@ -375,8 +381,8 @@ inline std::size_t ScanVerify(const char *x, std::size_t xLen, const char *set,
}
template <typename INT, typename CHAR, CharFunc FUNC>
-static void GeneralCharFunc(Descriptor &result, const Descriptor &string,
- const Descriptor &arg, const Descriptor *back,
+static RT_API_ATTRS void GeneralCharFunc(Descriptor &result,
+ const Descriptor &string, const Descriptor &arg, const Descriptor *back,
const Terminator &terminator) {
int rank{string.rank() ? string.rank()
: arg.rank() ? arg.rank()
@@ -433,9 +439,9 @@ static void GeneralCharFunc(Descriptor &result, const Descriptor &string,
}
template <typename CHAR, CharFunc FUNC>
-static void GeneralCharFuncKind(Descriptor &result, const Descriptor &string,
- const Descriptor &arg, const Descriptor *back, int kind,
- const Terminator &terminator) {
+static RT_API_ATTRS void GeneralCharFuncKind(Descriptor &result,
+ const Descriptor &string, const Descriptor &arg, const Descriptor *back,
+ int kind, const Terminator &terminator) {
switch (kind) {
case 1:
GeneralCharFunc<CppTypeFor<TypeCategory::Integer, 1>, CHAR, FUNC>(
@@ -464,30 +470,9 @@ static void GeneralCharFuncKind(Descriptor &result, const Descriptor &string,
}
}
-template <typename TO, typename FROM>
-static void CopyAndPad(
- TO *to, const FROM *from, std::size_t toChars, std::size_t fromChars) {
- if constexpr (sizeof(TO) != sizeof(FROM)) {
- std::size_t copyChars{std::min(toChars, fromChars)};
- for (std::size_t j{0}; j < copyChars; ++j) {
- to[j] = from[j];
- }
- for (std::size_t j{copyChars}; j < toChars; ++j) {
- to[j] = static_cast<TO>(' ');
- }
- } else if (toChars <= fromChars) {
- std::memcpy(to, from, toChars * sizeof(TO));
- } else {
- std::memcpy(to, from, fromChars * sizeof(TO));
- for (std::size_t j{fromChars}; j < toChars; ++j) {
- to[j] = static_cast<TO>(' ');
- }
- }
-}
-
template <typename CHAR, bool ISMIN>
-static void MaxMinHelper(Descriptor &accumulator, const Descriptor &x,
- const Terminator &terminator) {
+static RT_API_ATTRS void MaxMinHelper(Descriptor &accumulator,
+ const Descriptor &x, const Terminator &terminator) {
RUNTIME_CHECK(terminator,
accumulator.rank() == 0 || x.rank() == 0 ||
accumulator.rank() == x.rank());
@@ -545,7 +530,7 @@ static void MaxMinHelper(Descriptor &accumulator, const Descriptor &x,
}
template <bool ISMIN>
-static void MaxMin(Descriptor &accumulator, const Descriptor &x,
+static RT_API_ATTRS void MaxMin(Descriptor &accumulator, const Descriptor &x,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
RUNTIME_CHECK(terminator, accumulator.raw().type == x.raw().type);
@@ -566,8 +551,9 @@ static void MaxMin(Descriptor &accumulator, const Descriptor &x,
}
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-void RTNAME(CharacterConcatenate)(Descriptor &accumulator,
+void RTDEF(CharacterConcatenate)(Descriptor &accumulator,
const Descriptor &from, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
RUNTIME_CHECK(terminator,
@@ -616,7 +602,7 @@ void RTNAME(CharacterConcatenate)(Descriptor &accumulator,
FreeMemory(old);
}
-void RTNAME(CharacterConcatenateScalar1)(
+void RTDEF(CharacterConcatenateScalar1)(
Descriptor &accumulator, const char *from, std::size_t chars) {
Terminator terminator{__FILE__, __LINE__};
RUNTIME_CHECK(terminator, accumulator.rank() == 0);
@@ -629,7 +615,7 @@ void RTNAME(CharacterConcatenateScalar1)(
FreeMemory(old);
}
-int RTNAME(CharacterCompareScalar)(const Descriptor &x, const Descriptor &y) {
+int RTDEF(CharacterCompareScalar)(const Descriptor &x, const Descriptor &y) {
Terminator terminator{__FILE__, __LINE__};
RUNTIME_CHECK(terminator, x.rank() == 0);
RUNTIME_CHECK(terminator, y.rank() == 0);
@@ -653,22 +639,22 @@ int RTNAME(CharacterCompareScalar)(const Descriptor &x, const Descriptor &y) {
return 0;
}
-int RTNAME(CharacterCompareScalar1)(
+int RTDEF(CharacterCompareScalar1)(
const char *x, const char *y, std::size_t xChars, std::size_t yChars) {
return CharacterScalarCompare(x, y, xChars, yChars);
}
-int RTNAME(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
+int RTDEF(CharacterCompareScalar2)(const char16_t *x, const char16_t *y,
std::size_t xChars, std::size_t yChars) {
return CharacterScalarCompare(x, y, xChars, yChars);
}
-int RTNAME(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
+int RTDEF(CharacterCompareScalar4)(const char32_t *x, const char32_t *y,
std::size_t xChars, std::size_t yChars) {
return CharacterScalarCompare(x, y, xChars, yChars);
}
-void RTNAME(CharacterCompare)(
+void RTDEF(CharacterCompare)(
Descriptor &result, const Descriptor &x, const Descriptor &y) {
Terminator terminator{__FILE__, __LINE__};
RUNTIME_CHECK(terminator, x.raw().type == y.raw().type);
@@ -688,7 +674,7 @@ void RTNAME(CharacterCompare)(
}
}
-std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
+std::size_t RTDEF(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
std::size_t offset, const char *rhs, std::size_t rhsBytes) {
if (auto n{std::min(lhsBytes - offset, rhsBytes)}) {
std::memcpy(lhs + offset, rhs, n);
@@ -697,7 +683,7 @@ std::size_t RTNAME(CharacterAppend1)(char *lhs, std::size_t lhsBytes,
return offset;
}
-void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset) {
+void RTDEF(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset) {
if (bytes > offset) {
std::memset(lhs + offset, ' ', bytes - offset);
}
@@ -705,30 +691,30 @@ void RTNAME(CharacterPad1)(char *lhs, std::size_t bytes, std::size_t offset) {
// Intrinsic function entry points
-void RTNAME(Adjustl)(Descriptor &result, const Descriptor &string,
+void RTDEF(Adjustl)(Descriptor &result, const Descriptor &string,
const char *sourceFile, int sourceLine) {
AdjustLR<false>(result, string, sourceFile, sourceLine);
}
-void RTNAME(Adjustr)(Descriptor &result, const Descriptor &string,
+void RTDEF(Adjustr)(Descriptor &result, const Descriptor &string,
const char *sourceFile, int sourceLine) {
AdjustLR<true>(result, string, sourceFile, sourceLine);
}
-std::size_t RTNAME(Index1)(const char *x, std::size_t xLen, const char *set,
+std::size_t RTDEF(Index1)(const char *x, std::size_t xLen, const char *set,
std::size_t setLen, bool back) {
return Index<char>(x, xLen, set, setLen, back);
}
-std::size_t RTNAME(Index2)(const char16_t *x, std::size_t xLen,
+std::size_t RTDEF(Index2)(const char16_t *x, std::size_t xLen,
const char16_t *set, std::size_t setLen, bool back) {
return Index<char16_t>(x, xLen, set, setLen, back);
}
-std::size_t RTNAME(Index4)(const char32_t *x, std::size_t xLen,
+std::size_t RTDEF(Index4)(const char32_t *x, std::size_t xLen,
const char32_t *set, std::size_t setLen, bool back) {
return Index<char32_t>(x, xLen, set, setLen, back);
}
-void RTNAME(Index)(Descriptor &result, const Descriptor &string,
+void RTDEF(Index)(Descriptor &result, const Descriptor &string,
const Descriptor &substring, const Descriptor *back, int kind,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
@@ -751,17 +737,17 @@ void RTNAME(Index)(Descriptor &result, const Descriptor &string,
}
}
-std::size_t RTNAME(LenTrim1)(const char *x, std::size_t chars) {
+std::size_t RTDEF(LenTrim1)(const char *x, std::size_t chars) {
return LenTrim(x, chars);
}
-std::size_t RTNAME(LenTrim2)(const char16_t *x, std::size_t chars) {
+std::size_t RTDEF(LenTrim2)(const char16_t *x, std::size_t chars) {
return LenTrim(x, chars);
}
-std::size_t RTNAME(LenTrim4)(const char32_t *x, std::size_t chars) {
+std::size_t RTDEF(LenTrim4)(const char32_t *x, std::size_t chars) {
return LenTrim(x, chars);
}
-void RTNAME(LenTrim)(Descriptor &result, const Descriptor &string, int kind,
+void RTDEF(LenTrim)(Descriptor &result, const Descriptor &string, int kind,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
switch (string.raw().type) {
@@ -780,20 +766,20 @@ void RTNAME(LenTrim)(Descriptor &result, const Descriptor &string, int kind,
}
}
-std::size_t RTNAME(Scan1)(const char *x, std::size_t xLen, const char *set,
+std::size_t RTDEF(Scan1)(const char *x, std::size_t xLen, const char *set,
std::size_t setLen, bool back) {
return ScanVerify<char, CharFunc::Scan>(x, xLen, set, setLen, back);
}
-std::size_t RTNAME(Scan2)(const char16_t *x, std::size_t xLen,
+std::size_t RTDEF(Scan2)(const char16_t *x, std::size_t xLen,
const char16_t *set, std::size_t setLen, bool back) {
return ScanVerify<char16_t, CharFunc::Scan>(x, xLen, set, setLen, back);
}
-std::size_t RTNAME(Scan4)(const char32_t *x, std::size_t xLen,
+std::size_t RTDEF(Scan4)(const char32_t *x, std::size_t xLen,
const char32_t *set, std::size_t setLen, bool back) {
return ScanVerify<char32_t, CharFunc::Scan>(x, xLen, set, setLen, back);
}
-void RTNAME(Scan)(Descriptor &result, const Descriptor &string,
+void RTDEF(Scan)(Descriptor &result, const Descriptor &string,
const Descriptor &set, const Descriptor *back, int kind,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
@@ -816,7 +802,7 @@ void RTNAME(Scan)(Descriptor &result, const Descriptor &string,
}
}
-void RTNAME(Repeat)(Descriptor &result, const Descriptor &string,
+void RTDEF(Repeat)(Descriptor &result, const Descriptor &string,
std::int64_t ncopies, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (ncopies < 0) {
@@ -835,7 +821,7 @@ void RTNAME(Repeat)(Descriptor &result, const Descriptor &string,
}
}
-void RTNAME(Trim)(Descriptor &result, const Descriptor &string,
+void RTDEF(Trim)(Descriptor &result, const Descriptor &string,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
std::size_t resultBytes{0};
@@ -864,20 +850,20 @@ void RTNAME(Trim)(Descriptor &result, const Descriptor &string,
std::memcpy(result.OffsetElement(), string.OffsetElement(), resultBytes);
}
-std::size_t RTNAME(Verify1)(const char *x, std::size_t xLen, const char *set,
+std::size_t RTDEF(Verify1)(const char *x, std::size_t xLen, const char *set,
std::size_t setLen, bool back) {
return ScanVerify<char, CharFunc::Verify>(x, xLen, set, setLen, back);
}
-std::size_t RTNAME(Verify2)(const char16_t *x, std::size_t xLen,
+std::size_t RTDEF(Verify2)(const char16_t *x, std::size_t xLen,
const char16_t *set, std::size_t setLen, bool back) {
return ScanVerify<char16_t, CharFunc::Verify>(x, xLen, set, setLen, back);
}
-std::size_t RTNAME(Verify4)(const char32_t *x, std::size_t xLen,
+std::size_t RTDEF(Verify4)(const char32_t *x, std::size_t xLen,
const char32_t *set, std::size_t setLen, bool back) {
return ScanVerify<char32_t, CharFunc::Verify>(x, xLen, set, setLen, back);
}
-void RTNAME(Verify)(Descriptor &result, const Descriptor &string,
+void RTDEF(Verify)(Descriptor &result, const Descriptor &string,
const Descriptor &set, const Descriptor *back, int kind,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
@@ -900,14 +886,16 @@ void RTNAME(Verify)(Descriptor &result, const Descriptor &string,
}
}
-void RTNAME(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
+void RTDEF(CharacterMax)(Descriptor &accumulator, const Descriptor &x,
const char *sourceFile, int sourceLine) {
MaxMin<false>(accumulator, x, sourceFile, sourceLine);
}
-void RTNAME(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
+void RTDEF(CharacterMin)(Descriptor &accumulator, const Descriptor &x,
const char *sourceFile, int sourceLine) {
MaxMin<true>(accumulator, x, sourceFile, sourceLine);
}
+
+RT_EXT_API_GROUP_END
}
} // namespace Fortran::runtime
diff --git a/flang/runtime/connection.cpp b/flang/runtime/connection.cpp
index 0abacd7995b4..91ac9a0e14e4 100644
--- a/flang/runtime/connection.cpp
+++ b/flang/runtime/connection.cpp
@@ -46,13 +46,15 @@ SavedPosition::SavedPosition(IoStatementState &io) : io_{io} {
}
SavedPosition::~SavedPosition() {
- ConnectionState &conn{io_.GetConnectionState()};
- while (conn.currentRecordNumber > saved_.currentRecordNumber) {
- io_.BackspaceRecord();
+ if (!cancelled_) {
+ ConnectionState &conn{io_.GetConnectionState()};
+ while (conn.currentRecordNumber > saved_.currentRecordNumber) {
+ io_.BackspaceRecord();
+ }
+ conn.leftTabLimit = saved_.leftTabLimit;
+ conn.furthestPositionInRecord = saved_.furthestPositionInRecord;
+ conn.positionInRecord = saved_.positionInRecord;
+ conn.pinnedFrame = saved_.pinnedFrame;
}
- conn.leftTabLimit = saved_.leftTabLimit;
- conn.furthestPositionInRecord = saved_.furthestPositionInRecord;
- conn.positionInRecord = saved_.positionInRecord;
- conn.pinnedFrame = saved_.pinnedFrame;
}
} // namespace Fortran::runtime::io
diff --git a/flang/runtime/connection.h b/flang/runtime/connection.h
index 70c20e17fd01..c9a7566f2098 100644
--- a/flang/runtime/connection.h
+++ b/flang/runtime/connection.h
@@ -111,10 +111,12 @@ class SavedPosition {
public:
explicit SavedPosition(IoStatementState &);
~SavedPosition();
+ void Cancel() { cancelled_ = true; }
private:
IoStatementState &io_;
ConnectionState saved_;
+ bool cancelled_{false};
};
} // namespace Fortran::runtime::io
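For reference, a minimal standalone sketch (not part of this patch; types are illustrative stand-ins for IoStatementState/ConnectionState) of the cancellation pattern that SavedPosition now supports: the destructor restores the saved state only when Cancel() has not been called.

class ScopedRestore {
public:
  explicit ScopedRestore(int &state) : state_{state}, saved_{state} {}
  ~ScopedRestore() {
    if (!cancelled_) {
      state_ = saved_; // roll back only when the caller did not cancel
    }
  }
  void Cancel() { cancelled_ = true; }
private:
  int &state_;
  int saved_;
  bool cancelled_{false};
};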
diff --git a/flang/runtime/copy.cpp b/flang/runtime/copy.cpp
index 71ef2c2f7566..9e62d1e24a47 100644
--- a/flang/runtime/copy.cpp
+++ b/flang/runtime/copy.cpp
@@ -14,8 +14,9 @@
#include <cstring>
namespace Fortran::runtime {
+RT_OFFLOAD_API_GROUP_BEGIN
-void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
+RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
const Descriptor &from, const SubscriptValue fromAt[],
Terminator &terminator) {
char *toPtr{to.Element<char>(toAt)};
@@ -48,7 +49,7 @@ void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
}
}
-void CopyArray(
+RT_API_ATTRS void CopyArray(
const Descriptor &to, const Descriptor &from, Terminator &terminator) {
std::size_t elements{to.Elements()};
RUNTIME_CHECK(terminator, elements == from.Elements());
@@ -61,4 +62,6 @@ void CopyArray(
from.IncrementSubscripts(fromAt);
}
}
+
+RT_OFFLOAD_API_GROUP_END
} // namespace Fortran::runtime
diff --git a/flang/runtime/derived-api.cpp b/flang/runtime/derived-api.cpp
index 39bf0521e73b..321f50a1edfc 100644
--- a/flang/runtime/derived-api.cpp
+++ b/flang/runtime/derived-api.cpp
@@ -10,14 +10,16 @@
#include "flang/Runtime/derived-api.h"
#include "derived.h"
#include "terminator.h"
+#include "tools.h"
#include "type-info.h"
#include "flang/Runtime/descriptor.h"
namespace Fortran::runtime {
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-void RTNAME(Initialize)(
+void RTDEF(Initialize)(
const Descriptor &descriptor, const char *sourceFile, int sourceLine) {
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
if (const auto *derived{addendum->derivedType()}) {
@@ -29,7 +31,7 @@ void RTNAME(Initialize)(
}
}
-void RTNAME(Destroy)(const Descriptor &descriptor) {
+void RTDEF(Destroy)(const Descriptor &descriptor) {
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
if (const auto *derived{addendum->derivedType()}) {
if (!derived->noDestructionNeeded()) {
@@ -41,7 +43,7 @@ void RTNAME(Destroy)(const Descriptor &descriptor) {
}
}
-void RTNAME(Finalize)(
+void RTDEF(Finalize)(
const Descriptor &descriptor, const char *sourceFile, int sourceLine) {
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
if (const auto *derived{addendum->derivedType()}) {
@@ -53,7 +55,7 @@ void RTNAME(Finalize)(
}
}
-bool RTNAME(ClassIs)(
+bool RTDEF(ClassIs)(
const Descriptor &descriptor, const typeInfo::DerivedType &derivedType) {
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
if (const auto *derived{addendum->derivedType()}) {
@@ -72,7 +74,8 @@ bool RTNAME(ClassIs)(
return false;
}
-static bool CompareDerivedTypeNames(const Descriptor &a, const Descriptor &b) {
+static RT_API_ATTRS bool CompareDerivedTypeNames(
+ const Descriptor &a, const Descriptor &b) {
if (a.raw().version == CFI_VERSION &&
a.type() == TypeCode{TypeCategory::Character, 1} &&
a.ElementBytes() > 0 && a.rank() == 0 && a.OffsetElement() != nullptr &&
@@ -80,18 +83,20 @@ static bool CompareDerivedTypeNames(const Descriptor &a, const Descriptor &b) {
b.type() == TypeCode{TypeCategory::Character, 1} &&
b.ElementBytes() > 0 && b.rank() == 0 && b.OffsetElement() != nullptr &&
a.ElementBytes() == b.ElementBytes() &&
- memcmp(a.OffsetElement(), b.OffsetElement(), a.ElementBytes()) == 0) {
+ Fortran::runtime::memcmp(
+ a.OffsetElement(), b.OffsetElement(), a.ElementBytes()) == 0) {
return true;
}
return false;
}
-inline bool CompareDerivedType(
+inline RT_API_ATTRS bool CompareDerivedType(
const typeInfo::DerivedType *a, const typeInfo::DerivedType *b) {
return a == b || CompareDerivedTypeNames(a->name(), b->name());
}
-static const typeInfo::DerivedType *GetDerivedType(const Descriptor &desc) {
+static const RT_API_ATTRS typeInfo::DerivedType *GetDerivedType(
+ const Descriptor &desc) {
if (const DescriptorAddendum * addendum{desc.Addendum()}) {
if (const auto *derived{addendum->derivedType()}) {
return derived;
@@ -100,7 +105,7 @@ static const typeInfo::DerivedType *GetDerivedType(const Descriptor &desc) {
return nullptr;
}
-bool RTNAME(SameTypeAs)(const Descriptor &a, const Descriptor &b) {
+bool RTDEF(SameTypeAs)(const Descriptor &a, const Descriptor &b) {
auto aType{a.raw().type};
auto bType{b.raw().type};
if ((aType != CFI_type_struct && aType != CFI_type_other) ||
@@ -125,7 +130,7 @@ bool RTNAME(SameTypeAs)(const Descriptor &a, const Descriptor &b) {
}
}
-bool RTNAME(ExtendsTypeOf)(const Descriptor &a, const Descriptor &mold) {
+bool RTDEF(ExtendsTypeOf)(const Descriptor &a, const Descriptor &mold) {
auto aType{a.raw().type};
auto moldType{mold.raw().type};
if ((aType != CFI_type_struct && aType != CFI_type_other) ||
@@ -152,7 +157,7 @@ bool RTNAME(ExtendsTypeOf)(const Descriptor &a, const Descriptor &mold) {
}
}
-void RTNAME(DestroyWithoutFinalization)(const Descriptor &descriptor) {
+void RTDEF(DestroyWithoutFinalization)(const Descriptor &descriptor) {
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
if (const auto *derived{addendum->derivedType()}) {
if (!derived->noDestructionNeeded()) {
@@ -162,5 +167,6 @@ void RTNAME(DestroyWithoutFinalization)(const Descriptor &descriptor) {
}
}
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/dot-product.cpp b/flang/runtime/dot-product.cpp
index 58382863a500..977698269bcb 100644
--- a/flang/runtime/dot-product.cpp
+++ b/flang/runtime/dot-product.cpp
@@ -21,14 +21,20 @@ namespace Fortran::runtime {
// Beware: DOT_PRODUCT of COMPLEX data uses the complex conjugate of the first
// argument; MATMUL does not.
+// Suppress warnings about calling __host__-only std::complex operators,
+// defined in the C++ standard library headers, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// General accumulator for any type and stride; this is not used for
// contiguous numeric vectors.
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
class Accumulator {
public:
using Result = AccumulationType<RCAT, RKIND>;
- Accumulator(const Descriptor &x, const Descriptor &y) : x_{x}, y_{y} {}
- void AccumulateIndexed(SubscriptValue xAt, SubscriptValue yAt) {
+ RT_API_ATTRS Accumulator(const Descriptor &x, const Descriptor &y)
+ : x_{x}, y_{y} {}
+ RT_API_ATTRS void AccumulateIndexed(SubscriptValue xAt, SubscriptValue yAt) {
if constexpr (RCAT == TypeCategory::Logical) {
sum_ = sum_ ||
(IsLogicalElementTrue(x_, &xAt) && IsLogicalElementTrue(y_, &yAt));
@@ -43,7 +49,7 @@ public:
}
}
}
- Result GetResult() const { return sum_; }
+ RT_API_ATTRS Result GetResult() const { return sum_; }
private:
const Descriptor &x_, &y_;
@@ -51,7 +57,7 @@ private:
};
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
-static inline CppTypeFor<RCAT, RKIND> DoDotProduct(
+static inline RT_API_ATTRS CppTypeFor<RCAT, RKIND> DoDotProduct(
const Descriptor &x, const Descriptor &y, Terminator &terminator) {
using Result = CppTypeFor<RCAT, RKIND>;
RUNTIME_CHECK(terminator, x.rank() == 1 && y.rank() == 1);
@@ -83,8 +89,14 @@ static inline CppTypeFor<RCAT, RKIND> DoDotProduct(
AccumType accum{};
if constexpr (RCAT == TypeCategory::Complex) {
for (SubscriptValue j{0}; j < n; ++j) {
- accum += std::conj(static_cast<AccumType>(*xp++)) *
+ // std::conj() may instantiate its argument twice,
+ // so xp has to be incremented separately.
+ // This is a workaround for an alleged bug in clang
+ // that shows up as:
+ // warning: multiple unsequenced modifications to 'xp'
+ accum += std::conj(static_cast<AccumType>(*xp)) *
static_cast<AccumType>(*yp++);
+ xp++;
}
} else {
for (SubscriptValue j{0}; j < n; ++j) {
@@ -105,11 +117,13 @@ static inline CppTypeFor<RCAT, RKIND> DoDotProduct(
return static_cast<Result>(accumulator.GetResult());
}
+RT_DIAG_POP
+
template <TypeCategory RCAT, int RKIND> struct DotProduct {
using Result = CppTypeFor<RCAT, RKIND>;
template <TypeCategory XCAT, int XKIND> struct DP1 {
template <TypeCategory YCAT, int YKIND> struct DP2 {
- Result operator()(const Descriptor &x, const Descriptor &y,
+ RT_API_ATTRS Result operator()(const Descriptor &x, const Descriptor &y,
Terminator &terminator) const {
if constexpr (constexpr auto resultType{
GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
@@ -125,12 +139,12 @@ template <TypeCategory RCAT, int RKIND> struct DotProduct {
static_cast<int>(YCAT), YKIND);
}
};
- Result operator()(const Descriptor &x, const Descriptor &y,
+ RT_API_ATTRS Result operator()(const Descriptor &x, const Descriptor &y,
Terminator &terminator, TypeCategory yCat, int yKind) const {
return ApplyType<DP2, Result>(yCat, yKind, terminator, x, y, terminator);
}
};
- Result operator()(const Descriptor &x, const Descriptor &y,
+ RT_API_ATTRS Result operator()(const Descriptor &x, const Descriptor &y,
const char *source, int line) const {
Terminator terminator{source, line};
if (RCAT != TypeCategory::Logical && x.type() == y.type()) {
@@ -148,24 +162,26 @@ template <TypeCategory RCAT, int RKIND> struct DotProduct {
};
extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(DotProductInteger1)(
+RT_EXT_API_GROUP_BEGIN
+
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(DotProductInteger1)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Integer, 1>{}(x, y, source, line);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(DotProductInteger2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(DotProductInteger2)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Integer, 2>{}(x, y, source, line);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(DotProductInteger4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(DotProductInteger4)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Integer, 4>{}(x, y, source, line);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(DotProductInteger8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(DotProductInteger8)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Integer, 8>{}(x, y, source, line);
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(DotProductInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(DotProductInteger16)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Integer, 16>{}(x, y, source, line);
}
@@ -173,53 +189,55 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(DotProductInteger16)(
// TODO: REAL/COMPLEX(2 & 3)
// Intermediate results and operations are at least 64 bits
-CppTypeFor<TypeCategory::Real, 4> RTNAME(DotProductReal4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(DotProductReal4)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Real, 4>{}(x, y, source, line);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(DotProductReal8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(DotProductReal8)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Real, 8>{}(x, y, source, line);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(DotProductReal10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(DotProductReal10)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Real, 10>{}(x, y, source, line);
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(DotProductReal16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(DotProductReal16)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Real, 16>{}(x, y, source, line);
}
#endif
-void RTNAME(CppDotProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
+void RTDEF(CppDotProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
const Descriptor &x, const Descriptor &y, const char *source, int line) {
result = DotProduct<TypeCategory::Complex, 4>{}(x, y, source, line);
}
-void RTNAME(CppDotProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
+void RTDEF(CppDotProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
const Descriptor &x, const Descriptor &y, const char *source, int line) {
result = DotProduct<TypeCategory::Complex, 8>{}(x, y, source, line);
}
#if LDBL_MANT_DIG == 64
-void RTNAME(CppDotProductComplex10)(
+void RTDEF(CppDotProductComplex10)(
CppTypeFor<TypeCategory::Complex, 10> &result, const Descriptor &x,
const Descriptor &y, const char *source, int line) {
result = DotProduct<TypeCategory::Complex, 10>{}(x, y, source, line);
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTNAME(CppDotProductComplex16)(
+void RTDEF(CppDotProductComplex16)(
CppTypeFor<TypeCategory::Complex, 16> &result, const Descriptor &x,
const Descriptor &y, const char *source, int line) {
result = DotProduct<TypeCategory::Complex, 16>{}(x, y, source, line);
}
#endif
-bool RTNAME(DotProductLogical)(
+bool RTDEF(DotProductLogical)(
const Descriptor &x, const Descriptor &y, const char *source, int line) {
return DotProduct<TypeCategory::Logical, 1>{}(x, y, source, line);
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
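As a standalone illustration (not part of this patch; identifiers are illustrative) of the DOT_PRODUCT convention noted above, the first argument is conjugated, and the pointer increment is hoisted out of the accumulation expression, mirroring the workaround for the unsequenced-modification warning:

#include <complex>
#include <cstddef>

std::complex<double> DotConj(const std::complex<double> *xp,
    const std::complex<double> *yp, std::size_t n) {
  std::complex<double> accum{};
  for (std::size_t j{0}; j < n; ++j) {
    accum += std::conj(*xp) * *yp++; // conjugate of the first argument
    ++xp; // incremented outside the expression, as in the patch
  }
  return accum;
}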
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index 822099b5141b..6d4fa588cbf6 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -21,7 +21,8 @@ namespace Fortran::runtime::io {
static inline bool IsCharValueSeparator(const DataEdit &edit, char32_t ch) {
char32_t comma{
edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}};
- return ch == ' ' || ch == '\t' || ch == '/' || ch == comma;
+ return ch == ' ' || ch == '\t' || ch == comma || ch == '/' ||
+ (edit.IsNamelist() && (ch == '&' || ch == '$'));
}
static bool CheckCompleteListDirectedField(
@@ -243,10 +244,21 @@ bool EditIntegerInput(
if (sign == '-') {
value = -value;
}
- if (any || !io.GetConnectionState().IsAtEOF()) {
- std::memcpy(n, &value, kind); // a blank field means zero
+ if (any || !io.GetIoErrorHandler().InError()) {
+ // The value is stored in the lower-order bits on big-endian platforms.
+ // When doing the memcpy, shift the value to the higher-order bits.
+ auto shft{static_cast<int>(sizeof(value.low())) - kind};
+ // For kind==8 (i.e. shft==0), the value is stored in low_ in big endian.
+ if (!isHostLittleEndian && shft >= 0) {
+ auto l{value.low() << (8 * shft)};
+ std::memcpy(n, &l, kind);
+ } else {
+ std::memcpy(n, &value, kind); // a blank field means zero
+ }
+ return true;
+ } else {
+ return false;
}
- return any;
}
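A hypothetical, self-contained sketch (identifiers are illustrative, not from the runtime; kinds 1..8 assumed) of the big-endian adjustment above: when the accumulated value is wider than the target kind, the significant bytes sit at the high end of the object on a big-endian host, so the value is shifted up before the first kind bytes are copied.

#include <cstdint>
#include <cstring>

void StoreInt(void *n, std::uint64_t value, int kind, bool hostIsLittleEndian) {
  int shift{static_cast<int>(sizeof value) - kind}; // unused bytes at the top
  if (!hostIsLittleEndian && shift >= 0) {
    std::uint64_t shifted{value << (8 * shift)};
    std::memcpy(n, &shifted, kind); // significant bytes now lead the object
  } else {
    std::memcpy(n, &value, kind); // little-endian: low-order bytes lead
  }
}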
// Parses a REAL input number from the input source as a normalized
@@ -478,6 +490,9 @@ static void RaiseFPExceptions(decimal::ConversionResultFlags flags) {
if (flags & decimal::ConversionResultFlags::Overflow) {
RAISE(FE_OVERFLOW);
}
+ if (flags & decimal::ConversionResultFlags::Underflow) {
+ RAISE(FE_UNDERFLOW);
+ }
if (flags & decimal::ConversionResultFlags::Inexact) {
RAISE(FE_INEXACT);
}
@@ -612,48 +627,26 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal(
fraction <<= 1;
--expo;
}
- // Rounding
- bool increase{false};
- switch (rounding) {
- case decimal::RoundNearest: // RN & RP
- increase = roundingBit && (guardBit | ((int)fraction & 1));
- break;
- case decimal::RoundUp: // RU
- increase = !isNegative && (roundingBit | guardBit);
- break;
- case decimal::RoundDown: // RD
- increase = isNegative && (roundingBit | guardBit);
- break;
- case decimal::RoundToZero: // RZ
- break;
- case decimal::RoundCompatible: // RC
- increase = roundingBit != 0;
- break;
- }
- if (increase) {
- ++fraction;
- if (fraction >> binaryPrecision) {
- fraction >>= 1;
- ++expo;
- }
- }
}
// Package & return result
constexpr RawType significandMask{(one << RealType::significandBits) - 1};
+ int flags{(roundingBit | guardBit) ? decimal::Inexact : decimal::Exact};
if (!fraction) {
expo = 0;
} else if (expo == 1 && !(fraction >> (binaryPrecision - 1))) {
expo = 0; // subnormal
+ flags |= decimal::Underflow;
} else if (expo >= RealType::maxExponent) {
expo = RealType::maxExponent; // +/-Inf
fraction = 0;
+ flags |= decimal::Overflow;
} else {
fraction &= significandMask; // remove explicit normalization unless x87
}
return decimal::ConversionToBinaryResult<binaryPrecision>{
RealType{static_cast<RawType>(signBit |
static_cast<RawType>(expo) << RealType::significandBits | fraction)},
- (roundingBit | guardBit) ? decimal::Inexact : decimal::Exact};
+ static_cast<decimal::ConversionResultFlags>(flags)};
}
template <int KIND>
@@ -895,6 +888,10 @@ static bool EditListDirectedCharacterInput(
case '/':
isSep = true;
break;
+ case '&':
+ case '$':
+ isSep = edit.IsNamelist();
+ break;
case ',':
isSep = !(edit.modes.editingFlags & decimalComma);
break;
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index a4ce0b12f911..26e066c85fed 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -341,11 +341,12 @@ bool RealOutputEditing<KIND>::EditEorDOutput(const DataEdit &edit) {
ConvertToDecimal(significantDigits, edit.modes.round, flags)};
if (IsInfOrNaN(converted.str, static_cast<int>(converted.length))) {
return editWidth > 0 &&
- converted.length > static_cast<std::size_t>(editWidth)
+ converted.length + trailingBlanks_ >
+ static_cast<std::size_t>(editWidth)
? EmitRepeated(io_, '*', editWidth)
: EmitPrefix(edit, converted.length, editWidth) &&
EmitAscii(io_, converted.str, converted.length) &&
- EmitSuffix(edit);
+ EmitRepeated(io_, ' ', trailingBlanks_) && EmitSuffix(edit);
}
if (!IsZero()) {
converted.decimalExponent -= scale;
@@ -522,8 +523,9 @@ bool RealOutputEditing<KIND>::EditFOutput(const DataEdit &edit) {
zeroesBeforePoint = 1; // "." -> "0."
}
int totalLength{signLength + digitsBeforePoint + zeroesBeforePoint +
- 1 /*'.'*/ + zeroesAfterPoint + digitsAfterPoint + trailingZeroes};
- int width{editWidth > 0 ? editWidth : totalLength};
+ 1 /*'.'*/ + zeroesAfterPoint + digitsAfterPoint + trailingZeroes +
+ trailingBlanks_ /* G editing converted to F */};
+ int width{editWidth > 0 || trailingBlanks_ ? editWidth : totalLength};
if (totalLength > width) {
return EmitRepeated(io_, '*', width);
}
@@ -574,8 +576,11 @@ DataEdit RealOutputEditing<KIND>::EditForGOutput(DataEdit edit) {
trailingBlanks_ = 0;
if (editWidth > 0) {
int expoDigits{edit.expoDigits.value_or(0)};
+ // F'2023 13.7.5.2.3 p5: "If 0 <= s <= d, the scale factor has no effect
+ // and F(w − n).(d − s),n(’b’) editing is used where b is a blank and
+ // n is 4 for Gw.d editing, e + 2 for Gw.dEe editing if e > 0, and
+ // 4 for Gw.dE0 editing."
trailingBlanks_ = expoDigits > 0 ? expoDigits + 2 : 4; // 'n'
- *edit.width = std::max(0, editWidth - trailingBlanks_);
}
if (edit.digits.has_value()) {
*edit.digits = std::max(0, *edit.digits - expo);
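(Worked example of the quoted rule, for illustration only: with G12.4 and no exponent part, n is 4, so a value in the F range is edited as if by F(12−4).(d−s), i.e. F8.(d−s), followed by four trailing blanks within the 12-character field.)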
@@ -649,7 +654,7 @@ auto RealOutputEditing<KIND>::ConvertToHexadecimal(
// x_.binaryPrecision is constant, so / can be used for readability.
int shift{x_.binaryPrecision - 4};
typename BinaryFloatingPoint::RawType one{1};
- auto remaining{(one << shift) - one};
+ auto remaining{(one << x_.binaryPrecision) - one};
for (int digits{0}; digits < significantDigits; ++digits) {
if ((flags & decimal::Minimize) && !(fraction & remaining)) {
break;
@@ -682,7 +687,8 @@ bool RealOutputEditing<KIND>::EditEXOutput(const DataEdit &edit) {
flags |= decimal::AlwaysSign;
}
int editWidth{edit.width.value_or(0)}; // 'w' field
- if (editWidth == 0 && !edit.digits) { // EX0 (no .d)
+ if ((editWidth == 0 && !edit.digits) || editDigits == 0) {
+ // EX0 or EXw.0
flags |= decimal::Minimize;
significantDigits = 28; // enough for 128-bit F.P.
}
diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp
index b8e9b6eae132..1c025d40b395 100644
--- a/flang/runtime/extensions.cpp
+++ b/flang/runtime/extensions.cpp
@@ -10,13 +10,29 @@
// extensions that will eventually be implemented in Fortran.
#include "flang/Runtime/extensions.h"
+#include "tools.h"
#include "flang/Runtime/command.h"
#include "flang/Runtime/descriptor.h"
#include "flang/Runtime/io-api.h"
+#if _REENTRANT || _POSIX_C_SOURCE >= 199506L
+// System is posix-compliant and has getlogin_r
+#include <unistd.h>
+#endif
+
extern "C" {
namespace Fortran::runtime {
+
+void GetUsernameEnvVar(
+ const char *envName, std::byte *arg, std::int64_t length) {
+ Descriptor name{*Descriptor::Create(
+ 1, std::strlen(envName) + 1, const_cast<char *>(envName), 0)};
+ Descriptor value{*Descriptor::Create(1, length, arg, 0)};
+
+ RTNAME(GetEnvVariable)
+ (name, &value, nullptr, false, nullptr, __FILE__, __LINE__);
+}
namespace io {
// SUBROUTINE FLUSH(N)
// FLUSH N
@@ -37,5 +53,28 @@ void FORTRAN_PROCEDURE_NAME(getarg)(
(void)RTNAME(GetCommandArgument)(
n, &value, nullptr, nullptr, __FILE__, __LINE__);
}
+
+// CALL GETLOG(USRNAME)
+void FORTRAN_PROCEDURE_NAME(getlog)(std::byte *arg, std::int64_t length) {
+#if _REENTRANT || _POSIX_C_SOURCE >= 199506L
+ const int nameMaxLen{LOGIN_NAME_MAX + 1};
+ char str[nameMaxLen];
+
+ int error{getlogin_r(str, nameMaxLen)};
+ if (error == 0) {
+ // no error: find the first \0 in the string, then pad from there
+ CopyAndPad(reinterpret_cast<char *>(arg), str, length, std::strlen(str));
+ } else {
+ // an error occurred: get the username from an environment variable
+ GetUsernameEnvVar("LOGNAME", arg, length);
+ }
+#elif _WIN32
+ // Get the username from the environment to avoid linking to Advapi32.lib
+ GetUsernameEnvVar("USERNAME", arg, length);
+#else
+ GetUsernameEnvVar("LOGNAME", arg, length);
+#endif
+}
+
} // namespace Fortran::runtime
} // extern "C"
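For context, a standalone sketch (not part of this patch; assumes a POSIX system where getlogin_r and LOGIN_NAME_MAX are available) of the fallback order used by GETLOG above: prefer getlogin_r(), then fall back to an environment variable.

#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <limits.h>
#include <unistd.h>

void GetUserName(char *out, std::size_t outLen) {
  if (outLen == 0) {
    return;
  }
  char buf[LOGIN_NAME_MAX + 1];
  if (getlogin_r(buf, sizeof buf) == 0) {
    std::strncpy(out, buf, outLen - 1); // login name from the system
  } else if (const char *env{std::getenv("LOGNAME")}) {
    std::strncpy(out, env, outLen - 1); // fall back to the environment
  } else {
    out[0] = '\0';
    return;
  }
  out[outLen - 1] = '\0'; // ensure termination
}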
diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp
index 70b2c4d3d735..edb5d5f47a5a 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang/runtime/extrema.cpp
@@ -26,8 +26,8 @@ namespace Fortran::runtime {
template <typename T, bool IS_MAX, bool BACK> struct NumericCompare {
using Type = T;
- explicit NumericCompare(std::size_t /*elemLen; ignored*/) {}
- bool operator()(const T &value, const T &previous) const {
+ explicit RT_API_ATTRS NumericCompare(std::size_t /*elemLen; ignored*/) {}
+ RT_API_ATTRS bool operator()(const T &value, const T &previous) const {
if (value == previous) {
return BACK;
} else if constexpr (IS_MAX) {
@@ -41,9 +41,9 @@ template <typename T, bool IS_MAX, bool BACK> struct NumericCompare {
template <typename T, bool IS_MAX, bool BACK> class CharacterCompare {
public:
using Type = T;
- explicit CharacterCompare(std::size_t elemLen)
+ explicit RT_API_ATTRS CharacterCompare(std::size_t elemLen)
: chars_{elemLen / sizeof(T)} {}
- bool operator()(const T &value, const T &previous) const {
+ RT_API_ATTRS bool operator()(const T &value, const T &previous) const {
int cmp{CharacterScalarCompare<T>(&value, &previous, chars_, chars_)};
if (cmp == 0) {
return BACK;
@@ -61,19 +61,20 @@ private:
template <typename COMPARE> class ExtremumLocAccumulator {
public:
using Type = typename COMPARE::Type;
- ExtremumLocAccumulator(const Descriptor &array)
+ RT_API_ATTRS ExtremumLocAccumulator(const Descriptor &array)
: array_{array}, argRank_{array.rank()}, compare_{array.ElementBytes()} {
Reinitialize();
}
- void Reinitialize() {
+ RT_API_ATTRS void Reinitialize() {
// per standard: result indices are all zero if no data
for (int j{0}; j < argRank_; ++j) {
extremumLoc_[j] = 0;
}
previous_ = nullptr;
}
- int argRank() const { return argRank_; }
- template <typename A> void GetResult(A *p, int zeroBasedDim = -1) {
+ RT_API_ATTRS int argRank() const { return argRank_; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int zeroBasedDim = -1) {
if (zeroBasedDim >= 0) {
*p = extremumLoc_[zeroBasedDim] -
array_.GetDimension(zeroBasedDim).LowerBound() + 1;
@@ -83,7 +84,8 @@ public:
}
}
}
- template <typename IGNORED> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename IGNORED>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
const auto &value{*array_.Element<Type>(at)};
if (!previous_ || compare_(value, *previous_)) {
previous_ = &value;
@@ -103,8 +105,8 @@ private:
};
template <typename ACCUMULATOR, typename CPPTYPE>
-static void LocationHelper(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, const Descriptor *mask,
+static RT_API_ATTRS void LocationHelper(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, const Descriptor *mask,
Terminator &terminator) {
ACCUMULATOR accumulator{x};
DoTotalReduction<CPPTYPE>(x, 0, mask, accumulator, intrinsic, terminator);
@@ -114,9 +116,9 @@ static void LocationHelper(const char *intrinsic, Descriptor &result,
template <TypeCategory CAT, int KIND, bool IS_MAX,
template <typename, bool, bool> class COMPARE>
-inline void DoMaxOrMinLoc(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, const char *source, int line,
- const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void DoMaxOrMinLoc(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, const char *source,
+ int line, const Descriptor *mask, bool back) {
using CppType = CppTypeFor<CAT, KIND>;
Terminator terminator{source, line};
if (back) {
@@ -130,7 +132,7 @@ inline void DoMaxOrMinLoc(const char *intrinsic, Descriptor &result,
template <bool IS_MAX> struct CharacterMaxOrMinLocHelper {
template <int KIND> struct Functor {
- void operator()(const char *intrinsic, Descriptor &result,
+ RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
const Descriptor &x, int kind, const char *source, int line,
const Descriptor *mask, bool back) const {
DoMaxOrMinLoc<TypeCategory::Character, KIND, IS_MAX, NumericCompare>(
@@ -140,9 +142,9 @@ template <bool IS_MAX> struct CharacterMaxOrMinLocHelper {
};
template <bool IS_MAX>
-inline void CharacterMaxOrMinLoc(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, const char *source, int line,
- const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void CharacterMaxOrMinLoc(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, const char *source,
+ int line, const Descriptor *mask, bool back) {
int rank{x.rank()};
SubscriptValue extent[1]{rank};
result.Establish(TypeCategory::Integer, kind, nullptr, 1, extent,
@@ -169,9 +171,9 @@ inline void CharacterMaxOrMinLoc(const char *intrinsic, Descriptor &result,
}
template <TypeCategory CAT, int KIND, bool IS_MAXVAL>
-inline void TotalNumericMaxOrMinLoc(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, const char *source, int line,
- const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void TotalNumericMaxOrMinLoc(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, const char *source,
+ int line, const Descriptor *mask, bool back) {
int rank{x.rank()};
SubscriptValue extent[1]{rank};
result.Establish(TypeCategory::Integer, kind, nullptr, 1, extent,
@@ -189,125 +191,129 @@ inline void TotalNumericMaxOrMinLoc(const char *intrinsic, Descriptor &result,
}
extern "C" {
-void RTNAME(MaxlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(MaxlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
CharacterMaxOrMinLoc<true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MaxlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 1, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MaxlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 2, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MaxlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 4, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MaxlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 8, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
#ifdef __SIZEOF_INT128__
-void RTNAME(MaxlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 16, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
#endif
-void RTNAME(MaxlocReal4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal4)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 4, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MaxlocReal8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal8)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 8, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
#if LDBL_MANT_DIG == 64
-void RTNAME(MaxlocReal10)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal10)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 10, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTNAME(MaxlocReal16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MaxlocReal16)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 16, true>(
"MAXLOC", result, x, kind, source, line, mask, back);
}
#endif
-void RTNAME(MinlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocCharacter)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
CharacterMaxOrMinLoc<false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MinlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger1)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 1, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MinlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger2)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 2, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MinlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger4)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 4, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MinlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger8)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 8, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
#ifdef __SIZEOF_INT128__
-void RTNAME(MinlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocInteger16)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Integer, 16, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
#endif
-void RTNAME(MinlocReal4)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal4)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 4, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
-void RTNAME(MinlocReal8)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal8)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 8, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
#if LDBL_MANT_DIG == 64
-void RTNAME(MinlocReal10)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal10)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 10, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-void RTNAME(MinlocReal16)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocReal16)(Descriptor &result, const Descriptor &x, int kind,
const char *source, int line, const Descriptor *mask, bool back) {
TotalNumericMaxOrMinLoc<TypeCategory::Real, 16, false>(
"MINLOC", result, x, kind, source, line, mask, back);
}
#endif
+
+RT_EXT_API_GROUP_END
} // extern "C"
// MAXLOC/MINLOC with DIM=
template <TypeCategory CAT, int KIND, bool IS_MAX,
template <typename, bool, bool> class COMPARE, bool BACK>
-static void DoPartialMaxOrMinLocDirection(const char *intrinsic,
+static RT_API_ATTRS void DoPartialMaxOrMinLocDirection(const char *intrinsic,
Descriptor &result, const Descriptor &x, int kind, int dim,
const Descriptor *mask, Terminator &terminator) {
using CppType = CppTypeFor<CAT, KIND>;
@@ -320,9 +326,9 @@ static void DoPartialMaxOrMinLocDirection(const char *intrinsic,
template <TypeCategory CAT, int KIND, bool IS_MAX,
template <typename, bool, bool> class COMPARE>
-inline void DoPartialMaxOrMinLoc(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, int dim, const Descriptor *mask, bool back,
- Terminator &terminator) {
+inline RT_API_ATTRS void DoPartialMaxOrMinLoc(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, int dim,
+ const Descriptor *mask, bool back, Terminator &terminator) {
if (back) {
DoPartialMaxOrMinLocDirection<CAT, KIND, IS_MAX, COMPARE, true>(
intrinsic, result, x, kind, dim, mask, terminator);
@@ -336,7 +342,7 @@ template <TypeCategory CAT, bool IS_MAX,
template <typename, bool, bool> class COMPARE>
struct DoPartialMaxOrMinLocHelper {
template <int KIND> struct Functor {
- void operator()(const char *intrinsic, Descriptor &result,
+ RT_API_ATTRS void operator()(const char *intrinsic, Descriptor &result,
const Descriptor &x, int kind, int dim, const Descriptor *mask,
bool back, Terminator &terminator) const {
DoPartialMaxOrMinLoc<CAT, KIND, IS_MAX, COMPARE>(
@@ -346,9 +352,9 @@ struct DoPartialMaxOrMinLocHelper {
};
template <bool IS_MAX>
-inline void TypedPartialMaxOrMinLoc(const char *intrinsic, Descriptor &result,
- const Descriptor &x, int kind, int dim, const char *source, int line,
- const Descriptor *mask, bool back) {
+inline RT_API_ATTRS void TypedPartialMaxOrMinLoc(const char *intrinsic,
+ Descriptor &result, const Descriptor &x, int kind, int dim,
+ const char *source, int line, const Descriptor *mask, bool back) {
Terminator terminator{source, line};
CheckIntegerKind(terminator, kind, intrinsic);
auto catKind{x.type().GetCategoryAndKind()};
@@ -398,16 +404,20 @@ inline void TypedPartialMaxOrMinLoc(const char *intrinsic, Descriptor &result,
}
extern "C" {
-void RTNAME(MaxlocDim)(Descriptor &result, const Descriptor &x, int kind,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(MaxlocDim)(Descriptor &result, const Descriptor &x, int kind,
int dim, const char *source, int line, const Descriptor *mask, bool back) {
TypedPartialMaxOrMinLoc<true>(
"MAXLOC", result, x, kind, dim, source, line, mask, back);
}
-void RTNAME(MinlocDim)(Descriptor &result, const Descriptor &x, int kind,
+void RTDEF(MinlocDim)(Descriptor &result, const Descriptor &x, int kind,
int dim, const char *source, int line, const Descriptor *mask, bool back) {
TypedPartialMaxOrMinLoc<false>(
"MINLOC", result, x, kind, dim, source, line, mask, back);
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
// MAXVAL and MINVAL
@@ -415,7 +425,7 @@ void RTNAME(MinlocDim)(Descriptor &result, const Descriptor &x, int kind,
template <TypeCategory CAT, int KIND, bool IS_MAXVAL, typename Enable = void>
struct MaxOrMinIdentity {
using Type = CppTypeFor<CAT, KIND>;
- static constexpr Type Value() {
+ static constexpr RT_API_ATTRS Type Value() {
return IS_MAXVAL ? std::numeric_limits<Type>::lowest()
: std::numeric_limits<Type>::max();
}
@@ -425,7 +435,7 @@ struct MaxOrMinIdentity {
template <bool IS_MAXVAL>
struct MaxOrMinIdentity<TypeCategory::Integer, 16, IS_MAXVAL> {
using Type = CppTypeFor<TypeCategory::Integer, 16>;
- static constexpr Type Value() {
+ static constexpr RT_API_ATTRS Type Value() {
return IS_MAXVAL ? Type{1} << 127 : ~Type{0} >> 1;
}
};
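A quick standalone check (illustrative only; 64-bit values stand in for the 128-bit case) of the bit patterns used for the identity values above: shifting 1 into the sign bit gives the most negative value, and an all-ones pattern shifted right by one gives the maximum.

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  auto minPattern{static_cast<std::int64_t>(std::uint64_t{1} << 63)};
  auto maxPattern{static_cast<std::int64_t>(~std::uint64_t{0} >> 1)};
  assert(minPattern == std::numeric_limits<std::int64_t>::min());
  assert(maxPattern == std::numeric_limits<std::int64_t>::max());
  return 0;
}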
@@ -444,7 +454,7 @@ struct MaxOrMinIdentity<TypeCategory::Real, 16, IS_MAXVAL,
typename std::enable_if_t<
std::is_same_v<CppTypeFor<TypeCategory::Real, 16>, __float128>>> {
using Type = __float128;
- static Type Value() {
+ static RT_API_ATTRS Type Value() {
// Create a buffer to store binary representation of __float128 constant.
constexpr std::size_t alignment =
std::max(alignof(Type), alignof(std::uint64_t));
@@ -472,15 +482,16 @@ template <TypeCategory CAT, int KIND, bool IS_MAXVAL>
class NumericExtremumAccumulator {
public:
using Type = CppTypeFor<CAT, KIND>;
- explicit NumericExtremumAccumulator(const Descriptor &array)
+ explicit RT_API_ATTRS NumericExtremumAccumulator(const Descriptor &array)
: array_{array} {}
- void Reinitialize() {
+ RT_API_ATTRS void Reinitialize() {
extremum_ = MaxOrMinIdentity<CAT, KIND, IS_MAXVAL>::Value();
}
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
*p = extremum_;
}
- bool Accumulate(Type x) {
+ RT_API_ATTRS bool Accumulate(Type x) {
if constexpr (IS_MAXVAL) {
if (x > extremum_) {
extremum_ = x;
@@ -490,7 +501,8 @@ public:
}
return true;
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
return Accumulate(*array_.Element<A>(at));
}
@@ -500,16 +512,17 @@ private:
};
template <TypeCategory CAT, int KIND, bool IS_MAXVAL>
-inline CppTypeFor<CAT, KIND> TotalNumericMaxOrMin(const Descriptor &x,
- const char *source, int line, int dim, const Descriptor *mask,
- const char *intrinsic) {
+inline RT_API_ATTRS CppTypeFor<CAT, KIND> TotalNumericMaxOrMin(
+ const Descriptor &x, const char *source, int line, int dim,
+ const Descriptor *mask, const char *intrinsic) {
return GetTotalReduction<CAT, KIND>(x, source, line, dim, mask,
NumericExtremumAccumulator<CAT, KIND, IS_MAXVAL>{x}, intrinsic);
}
template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
-static void DoMaxMinNorm2(Descriptor &result, const Descriptor &x, int dim,
- const Descriptor *mask, const char *intrinsic, Terminator &terminator) {
+static RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
+ int dim, const Descriptor *mask, const char *intrinsic,
+ Terminator &terminator) {
using Type = CppTypeFor<CAT, KIND>;
ACCUMULATOR accumulator{x};
if (dim == 0 || x.rank() == 1) {
@@ -537,8 +550,8 @@ static void DoMaxMinNorm2(Descriptor &result, const Descriptor &x, int dim,
template <TypeCategory CAT, bool IS_MAXVAL> struct MaxOrMinHelper {
template <int KIND> struct Functor {
- void operator()(Descriptor &result, const Descriptor &x, int dim,
- const Descriptor *mask, const char *intrinsic,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
+ int dim, const Descriptor *mask, const char *intrinsic,
Terminator &terminator) const {
DoMaxMinNorm2<CAT, KIND,
NumericExtremumAccumulator<CAT, KIND, IS_MAXVAL>>(
@@ -548,9 +561,9 @@ template <TypeCategory CAT, bool IS_MAXVAL> struct MaxOrMinHelper {
};
template <bool IS_MAXVAL>
-inline void NumericMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
- const char *source, int line, const Descriptor *mask,
- const char *intrinsic) {
+inline RT_API_ATTRS void NumericMaxOrMin(Descriptor &result,
+ const Descriptor &x, int dim, const char *source, int line,
+ const Descriptor *mask, const char *intrinsic) {
Terminator terminator{source, line};
auto type{x.type().GetCategoryAndKind()};
RUNTIME_CHECK(terminator, type);
@@ -574,10 +587,11 @@ inline void NumericMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
template <int KIND, bool IS_MAXVAL> class CharacterExtremumAccumulator {
public:
using Type = CppTypeFor<TypeCategory::Character, KIND>;
- explicit CharacterExtremumAccumulator(const Descriptor &array)
+ explicit RT_API_ATTRS CharacterExtremumAccumulator(const Descriptor &array)
: array_{array}, charLen_{array_.ElementBytes() / KIND} {}
- void Reinitialize() { extremum_ = nullptr; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ RT_API_ATTRS void Reinitialize() { extremum_ = nullptr; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
static_assert(std::is_same_v<A, Type>);
std::size_t byteSize{array_.ElementBytes()};
if (extremum_) {
@@ -589,7 +603,7 @@ public:
std::memset(p, IS_MAXVAL ? 0 : KIND == 1 ? 127 : 255, byteSize);
}
}
- bool Accumulate(const Type *x) {
+ RT_API_ATTRS bool Accumulate(const Type *x) {
if (!extremum_) {
extremum_ = x;
} else {
@@ -600,7 +614,8 @@ public:
}
return true;
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
return Accumulate(array_.Element<A>(at));
}
@@ -612,8 +627,8 @@ private:
template <bool IS_MAXVAL> struct CharacterMaxOrMinHelper {
template <int KIND> struct Functor {
- void operator()(Descriptor &result, const Descriptor &x, int dim,
- const Descriptor *mask, const char *intrinsic,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
+ int dim, const Descriptor *mask, const char *intrinsic,
Terminator &terminator) const {
DoMaxMinNorm2<TypeCategory::Character, KIND,
CharacterExtremumAccumulator<KIND, IS_MAXVAL>>(
@@ -623,9 +638,9 @@ template <bool IS_MAXVAL> struct CharacterMaxOrMinHelper {
};
template <bool IS_MAXVAL>
-inline void CharacterMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
- const char *source, int line, const Descriptor *mask,
- const char *intrinsic) {
+inline RT_API_ATTRS void CharacterMaxOrMin(Descriptor &result,
+ const Descriptor &x, int dim, const char *source, int line,
+ const Descriptor *mask, const char *intrinsic) {
Terminator terminator{source, line};
auto type{x.type().GetCategoryAndKind()};
RUNTIME_CHECK(terminator, type && type->first == TypeCategory::Character);
@@ -635,28 +650,30 @@ inline void CharacterMaxOrMin(Descriptor &result, const Descriptor &x, int dim,
}
extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(MaxvalInteger1)(const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(MaxvalInteger1)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 1, true>(
x, source, line, dim, mask, "MAXVAL");
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(MaxvalInteger2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(MaxvalInteger2)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 2, true>(
x, source, line, dim, mask, "MAXVAL");
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(MaxvalInteger4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(MaxvalInteger4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 4, true>(
x, source, line, dim, mask, "MAXVAL");
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(MaxvalInteger8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(MaxvalInteger8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 8, true>(
x, source, line, dim, mask, "MAXVAL");
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(MaxvalInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(MaxvalInteger16)(
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 16, true>(
@@ -665,58 +682,58 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(MaxvalInteger16)(
#endif
// TODO: REAL(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(MaxvalReal4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 4> RTDEF(MaxvalReal4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 4, true>(
x, source, line, dim, mask, "MAXVAL");
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(MaxvalReal8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 8> RTDEF(MaxvalReal8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 8, true>(
x, source, line, dim, mask, "MAXVAL");
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(MaxvalReal10)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 10> RTDEF(MaxvalReal10)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 10, true>(
x, source, line, dim, mask, "MAXVAL");
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(MaxvalReal16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 16> RTDEF(MaxvalReal16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 16, true>(
x, source, line, dim, mask, "MAXVAL");
}
#endif
-void RTNAME(MaxvalCharacter)(Descriptor &result, const Descriptor &x,
+void RTDEF(MaxvalCharacter)(Descriptor &result, const Descriptor &x,
const char *source, int line, const Descriptor *mask) {
CharacterMaxOrMin<true>(result, x, 0, source, line, mask, "MAXVAL");
}
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(MinvalInteger1)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(MinvalInteger1)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 1, false>(
x, source, line, dim, mask, "MINVAL");
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(MinvalInteger2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(MinvalInteger2)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 2, false>(
x, source, line, dim, mask, "MINVAL");
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(MinvalInteger4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(MinvalInteger4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 4, false>(
x, source, line, dim, mask, "MINVAL");
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(MinvalInteger8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(MinvalInteger8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 8, false>(
x, source, line, dim, mask, "MINVAL");
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(MinvalInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(MinvalInteger16)(
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Integer, 16, false>(
@@ -725,37 +742,37 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(MinvalInteger16)(
#endif
// TODO: REAL(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(MinvalReal4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 4> RTDEF(MinvalReal4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 4, false>(
x, source, line, dim, mask, "MINVAL");
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(MinvalReal8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 8> RTDEF(MinvalReal8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 8, false>(
x, source, line, dim, mask, "MINVAL");
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(MinvalReal10)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 10> RTDEF(MinvalReal10)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 10, false>(
x, source, line, dim, mask, "MINVAL");
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(MinvalReal16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 16> RTDEF(MinvalReal16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return TotalNumericMaxOrMin<TypeCategory::Real, 16, false>(
x, source, line, dim, mask, "MINVAL");
}
#endif
-void RTNAME(MinvalCharacter)(Descriptor &result, const Descriptor &x,
+void RTDEF(MinvalCharacter)(Descriptor &result, const Descriptor &x,
const char *source, int line, const Descriptor *mask) {
CharacterMaxOrMin<false>(result, x, 0, source, line, mask, "MINVAL");
}
-void RTNAME(MaxvalDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(MaxvalDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line, const Descriptor *mask) {
if (x.type().IsCharacter()) {
CharacterMaxOrMin<true>(result, x, dim, source, line, mask, "MAXVAL");
@@ -763,7 +780,7 @@ void RTNAME(MaxvalDim)(Descriptor &result, const Descriptor &x, int dim,
NumericMaxOrMin<true>(result, x, dim, source, line, mask, "MAXVAL");
}
}
-void RTNAME(MinvalDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(MinvalDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line, const Descriptor *mask) {
if (x.type().IsCharacter()) {
CharacterMaxOrMin<false>(result, x, dim, source, line, mask, "MINVAL");
@@ -771,33 +788,42 @@ void RTNAME(MinvalDim)(Descriptor &result, const Descriptor &x, int dim,
NumericMaxOrMin<false>(result, x, dim, source, line, mask, "MINVAL");
}
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
// NORM2
-template <int KIND> class Norm2Accumulator {
-public:
- using Type = CppTypeFor<TypeCategory::Real, KIND>;
- // Use at least double precision for accumulators.
- // Don't use __float128, it doesn't work with abs() or sqrt() yet.
- static constexpr int largestLDKind {
+RT_VAR_GROUP_BEGIN
+
+// Use at least double precision for accumulators.
+// Don't use __float128, it doesn't work with abs() or sqrt() yet.
+static constexpr RT_CONST_VAR_ATTRS int largestLDKind {
#if LDBL_MANT_DIG == 113
- 16
+ 16
#elif LDBL_MANT_DIG == 64
- 10
+ 10
#else
- 8
+ 8
#endif
- };
+};
+
+RT_VAR_GROUP_END
+
+template <int KIND> class Norm2Accumulator {
+public:
+ using Type = CppTypeFor<TypeCategory::Real, KIND>;
using AccumType =
CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, largestLDKind)>;
- explicit Norm2Accumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() { max_ = sum_ = 0; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array)
+ : array_{array} {}
+ RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
// m * sqrt(1 + sum((others(:)/m)**2))
*p = static_cast<Type>(max_ * std::sqrt(1 + sum_));
}
- bool Accumulate(Type x) {
+ RT_API_ATTRS bool Accumulate(Type x) {
auto absX{std::abs(static_cast<AccumType>(x))};
if (!max_) {
max_ = absX;
@@ -813,7 +839,8 @@ public:
}
return true;
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
return Accumulate(*array_.Element<A>(at));
}
@@ -824,7 +851,7 @@ private:
};
template <int KIND> struct Norm2Helper {
- void operator()(Descriptor &result, const Descriptor &x, int dim,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim,
const Descriptor *mask, Terminator &terminator) const {
DoMaxMinNorm2<TypeCategory::Real, KIND, Norm2Accumulator<KIND>>(
result, x, dim, mask, "NORM2", terminator);
@@ -832,33 +859,35 @@ template <int KIND> struct Norm2Helper {
};
extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
// TODO: REAL(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Norm2_4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(Norm2_4)(
const Descriptor &x, const char *source, int line, int dim) {
return GetTotalReduction<TypeCategory::Real, 4>(
x, source, line, dim, nullptr, Norm2Accumulator<4>{x}, "NORM2");
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Norm2_8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(Norm2_8)(
const Descriptor &x, const char *source, int line, int dim) {
return GetTotalReduction<TypeCategory::Real, 8>(
x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2");
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Norm2_10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(Norm2_10)(
const Descriptor &x, const char *source, int line, int dim) {
return GetTotalReduction<TypeCategory::Real, 10>(
x, source, line, dim, nullptr, Norm2Accumulator<10>{x}, "NORM2");
}
#endif
#if LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Norm2_16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
const Descriptor &x, const char *source, int line, int dim) {
return GetTotalReduction<TypeCategory::Real, 16>(
x, source, line, dim, nullptr, Norm2Accumulator<16>{x}, "NORM2");
}
#endif
-void RTNAME(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line) {
Terminator terminator{source, line};
auto type{x.type().GetCategoryAndKind()};
@@ -870,5 +899,7 @@ void RTNAME(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
terminator.Crash("NORM2: bad type code %d", x.type().raw());
}
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
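
As a point of reference, the scaled accumulation performed by Norm2Accumulator above can be sketched as a standalone function (illustrative only; the names below are not part of the runtime). The accumulator tracks the running maximum magnitude m and the sum of (x/m)**2 over the other elements, so the final m * sqrt(1 + sum) never squares a value much larger than 1 and thus avoids premature overflow:

#include <cmath>
#include <vector>

double ScaledNorm2(const std::vector<double> &v) {
  double max{0}, sum{0}; // sum of (x/max)**2 over elements other than the max
  for (double x : v) {
    double absX{std::abs(x)};
    if (max == 0) {
      max = absX;
    } else if (absX > max) {
      double t{max / absX}; // < 1
      sum = sum * t * t + t * t; // rescale old terms; add one for the old max
      max = absX;
    } else {
      double t{absX / max};
      sum += t * t;
    }
  }
  return max * std::sqrt(1 + sum); // e.g. {3, 4} -> 5
}
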
diff --git a/flang/runtime/findloc.cpp b/flang/runtime/findloc.cpp
index 339e0c75f05f..674a21ae50b8 100644
--- a/flang/runtime/findloc.cpp
+++ b/flang/runtime/findloc.cpp
@@ -21,8 +21,8 @@ template <TypeCategory CAT1, int KIND1, TypeCategory CAT2, int KIND2>
struct Equality {
using Type1 = CppTypeFor<CAT1, KIND1>;
using Type2 = CppTypeFor<CAT2, KIND2>;
- bool operator()(const Descriptor &array, const SubscriptValue at[],
- const Descriptor &target) const {
+ RT_API_ATTRS bool operator()(const Descriptor &array,
+ const SubscriptValue at[], const Descriptor &target) const {
return *array.Element<Type1>(at) == *target.OffsetElement<Type2>();
}
};
@@ -31,8 +31,8 @@ template <int KIND1, int KIND2>
struct Equality<TypeCategory::Complex, KIND1, TypeCategory::Complex, KIND2> {
using Type1 = CppTypeFor<TypeCategory::Complex, KIND1>;
using Type2 = CppTypeFor<TypeCategory::Complex, KIND2>;
- bool operator()(const Descriptor &array, const SubscriptValue at[],
- const Descriptor &target) const {
+ RT_API_ATTRS bool operator()(const Descriptor &array,
+ const SubscriptValue at[], const Descriptor &target) const {
const Type1 &xz{*array.Element<Type1>(at)};
const Type2 &tz{*target.OffsetElement<Type2>()};
return xz.real() == tz.real() && xz.imag() == tz.imag();
@@ -43,8 +43,8 @@ template <int KIND1, TypeCategory CAT2, int KIND2>
struct Equality<TypeCategory::Complex, KIND1, CAT2, KIND2> {
using Type1 = CppTypeFor<TypeCategory::Complex, KIND1>;
using Type2 = CppTypeFor<CAT2, KIND2>;
- bool operator()(const Descriptor &array, const SubscriptValue at[],
- const Descriptor &target) const {
+ RT_API_ATTRS bool operator()(const Descriptor &array,
+ const SubscriptValue at[], const Descriptor &target) const {
const Type1 &z{*array.Element<Type1>(at)};
return z.imag() == 0 && z.real() == *target.OffsetElement<Type2>();
}
@@ -54,8 +54,8 @@ template <TypeCategory CAT1, int KIND1, int KIND2>
struct Equality<CAT1, KIND1, TypeCategory::Complex, KIND2> {
using Type1 = CppTypeFor<CAT1, KIND1>;
using Type2 = CppTypeFor<TypeCategory::Complex, KIND2>;
- bool operator()(const Descriptor &array, const SubscriptValue at[],
- const Descriptor &target) const {
+ RT_API_ATTRS bool operator()(const Descriptor &array,
+ const SubscriptValue at[], const Descriptor &target) const {
const Type2 &z{*target.OffsetElement<Type2>()};
return *array.Element<Type1>(at) == z.real() && z.imag() == 0;
}
@@ -63,8 +63,8 @@ struct Equality<CAT1, KIND1, TypeCategory::Complex, KIND2> {
template <int KIND> struct CharacterEquality {
using Type = CppTypeFor<TypeCategory::Character, KIND>;
- bool operator()(const Descriptor &array, const SubscriptValue at[],
- const Descriptor &target) const {
+ RT_API_ATTRS bool operator()(const Descriptor &array,
+ const SubscriptValue at[], const Descriptor &target) const {
return CharacterScalarCompare<Type>(array.Element<Type>(at),
target.OffsetElement<Type>(),
array.ElementBytes() / static_cast<unsigned>(KIND),
@@ -73,8 +73,8 @@ template <int KIND> struct CharacterEquality {
};
struct LogicalEquivalence {
- bool operator()(const Descriptor &array, const SubscriptValue at[],
- const Descriptor &target) const {
+ RT_API_ATTRS bool operator()(const Descriptor &array,
+ const SubscriptValue at[], const Descriptor &target) const {
return IsLogicalElementTrue(array, at) ==
IsLogicalElementTrue(target, at /*ignored*/);
}
@@ -82,29 +82,31 @@ struct LogicalEquivalence {
template <typename EQUALITY> class LocationAccumulator {
public:
- LocationAccumulator(
+ RT_API_ATTRS LocationAccumulator(
const Descriptor &array, const Descriptor &target, bool back)
- : array_{array}, target_{target}, back_{back} {
- Reinitialize();
- }
- void Reinitialize() {
- // per standard: result indices are all zero if no data
- for (int j{0}; j < rank_; ++j) {
- location_[j] = 0;
- }
- }
- template <typename A> void GetResult(A *p, int zeroBasedDim = -1) {
+ : array_{array}, target_{target}, back_{back} {}
+ RT_API_ATTRS void Reinitialize() { gotAnything_ = false; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int zeroBasedDim = -1) {
if (zeroBasedDim >= 0) {
- *p = location_[zeroBasedDim] -
- array_.GetDimension(zeroBasedDim).LowerBound() + 1;
- } else {
+ *p = gotAnything_ ? location_[zeroBasedDim] -
+ array_.GetDimension(zeroBasedDim).LowerBound() + 1
+ : 0;
+ } else if (gotAnything_) {
for (int j{0}; j < rank_; ++j) {
p[j] = location_[j] - array_.GetDimension(j).LowerBound() + 1;
}
+ } else {
+ // no unmasked hits? result is all zeroes
+ for (int j{0}; j < rank_; ++j) {
+ p[j] = 0;
+ }
}
}
- template <typename IGNORED> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename IGNORED>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
if (equality_(array_, at, target_)) {
+ gotAnything_ = true;
for (int j{0}; j < rank_; ++j) {
location_[j] = at[j];
}
@@ -119,6 +121,7 @@ private:
const Descriptor &target_;
const bool back_{false};
const int rank_{array_.rank()};
+ bool gotAnything_{false};
SubscriptValue location_[maxRank];
const EQUALITY equality_{};
};
@@ -126,7 +129,7 @@ private:
template <TypeCategory XCAT, int XKIND, TypeCategory TARGET_CAT>
struct TotalNumericFindlocHelper {
template <int TARGET_KIND> struct Functor {
- void operator()(Descriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
const Descriptor &target, int kind, int dim, const Descriptor *mask,
bool back, Terminator &terminator) const {
using Eq = Equality<XCAT, XKIND, TARGET_CAT, TARGET_KIND>;
@@ -144,9 +147,10 @@ template <TypeCategory CAT,
class HELPER>
struct NumericFindlocHelper {
template <int KIND> struct Functor {
- void operator()(TypeCategory targetCat, int targetKind, Descriptor &result,
- const Descriptor &x, const Descriptor &target, int kind, int dim,
- const Descriptor *mask, bool back, Terminator &terminator) const {
+ RT_API_ATTRS void operator()(TypeCategory targetCat, int targetKind,
+ Descriptor &result, const Descriptor &x, const Descriptor &target,
+ int kind, int dim, const Descriptor *mask, bool back,
+ Terminator &terminator) const {
switch (targetCat) {
case TypeCategory::Integer:
ApplyIntegerKind<
@@ -176,7 +180,7 @@ struct NumericFindlocHelper {
};
template <int KIND> struct CharacterFindlocHelper {
- void operator()(Descriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
const Descriptor &target, int kind, const Descriptor *mask, bool back,
Terminator &terminator) {
using Accumulator = LocationAccumulator<CharacterEquality<KIND>>;
@@ -187,9 +191,9 @@ template <int KIND> struct CharacterFindlocHelper {
}
};
-static void LogicalFindlocHelper(Descriptor &result, const Descriptor &x,
- const Descriptor &target, int kind, const Descriptor *mask, bool back,
- Terminator &terminator) {
+static RT_API_ATTRS void LogicalFindlocHelper(Descriptor &result,
+ const Descriptor &x, const Descriptor &target, int kind,
+ const Descriptor *mask, bool back, Terminator &terminator) {
using Accumulator = LocationAccumulator<LogicalEquivalence>;
Accumulator accumulator{x, target, back};
DoTotalReduction<void>(x, 0, mask, accumulator, "FINDLOC", terminator);
@@ -198,7 +202,9 @@ static void LogicalFindlocHelper(Descriptor &result, const Descriptor &x,
}
extern "C" {
-void RTNAME(Findloc)(Descriptor &result, const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(Findloc)(Descriptor &result, const Descriptor &x,
const Descriptor &target, int kind, const char *source, int line,
const Descriptor *mask, bool back) {
int rank{x.rank()};
@@ -250,6 +256,8 @@ void RTNAME(Findloc)(Descriptor &result, const Descriptor &x,
"FINDLOC: bad data type code (%d) for array", x.type().raw());
}
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
// FINDLOC with DIM=
@@ -257,7 +265,7 @@ void RTNAME(Findloc)(Descriptor &result, const Descriptor &x,
template <TypeCategory XCAT, int XKIND, TypeCategory TARGET_CAT>
struct PartialNumericFindlocHelper {
template <int TARGET_KIND> struct Functor {
- void operator()(Descriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
const Descriptor &target, int kind, int dim, const Descriptor *mask,
bool back, Terminator &terminator) const {
using Eq = Equality<XCAT, XKIND, TARGET_CAT, TARGET_KIND>;
@@ -271,7 +279,7 @@ struct PartialNumericFindlocHelper {
};
template <int KIND> struct PartialCharacterFindlocHelper {
- void operator()(Descriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
const Descriptor &target, int kind, int dim, const Descriptor *mask,
bool back, Terminator &terminator) {
using Accumulator = LocationAccumulator<CharacterEquality<KIND>>;
@@ -282,9 +290,9 @@ template <int KIND> struct PartialCharacterFindlocHelper {
}
};
-static void PartialLogicalFindlocHelper(Descriptor &result, const Descriptor &x,
- const Descriptor &target, int kind, int dim, const Descriptor *mask,
- bool back, Terminator &terminator) {
+static RT_API_ATTRS void PartialLogicalFindlocHelper(Descriptor &result,
+ const Descriptor &x, const Descriptor &target, int kind, int dim,
+ const Descriptor *mask, bool back, Terminator &terminator) {
using Accumulator = LocationAccumulator<LogicalEquivalence>;
Accumulator accumulator{x, target, back};
ApplyIntegerKind<PartialLocationHelper<Accumulator>::template Functor, void>(
@@ -293,7 +301,9 @@ static void PartialLogicalFindlocHelper(Descriptor &result, const Descriptor &x,
}
extern "C" {
-void RTNAME(FindlocDim)(Descriptor &result, const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(FindlocDim)(Descriptor &result, const Descriptor &x,
const Descriptor &target, int kind, int dim, const char *source, int line,
const Descriptor *mask, bool back) {
Terminator terminator{source, line};
@@ -337,5 +347,7 @@ void RTNAME(FindlocDim)(Descriptor &result, const Descriptor &x,
"FINDLOC: bad data type code (%d) for array", x.type().raw());
}
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
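
The gotAnything_ flag added above makes the FINDLOC result well defined when no element matches: the standard requires all-zero indices in that case, and otherwise the reported positions are 1-based offsets from each dimension's lower bound. A hypothetical one-dimensional analog of GetResult() (not part of findloc.cpp) shows the mapping:

#include <cstdint>

// 'found' plays the role of gotAnything_, 'at' of location_[dim], and
// 'lowerBound' of array_.GetDimension(dim).LowerBound().
std::int64_t FindlocResult1D(
    bool found, std::int64_t at, std::int64_t lowerBound) {
  return found ? at - lowerBound + 1 : 0;
}

// For A(-2:2) = [5, 7, 9, 7, 1] and TARGET = 7 (without BACK=), the first
// match is at subscript -1, so FindlocResult1D(true, -1, -2) == 2; with no
// match at all, FindlocResult1D(false, 0, -2) == 0.
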
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index 28248f76e882..bdc11ae93ac9 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -37,6 +37,11 @@
#define STD_MEMCMP_UNSUPPORTED 1
#endif
+#if !defined(STD_REALLOC_UNSUPPORTED) && \
+ (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_REALLOC_UNSUPPORTED 1
+#endif
+
namespace Fortran::runtime {
#if STD_FILL_N_UNSUPPORTED
@@ -118,5 +123,16 @@ static inline RT_API_ATTRS int memcmp(
using std::memcmp;
#endif // !STD_MEMCMP_UNSUPPORTED
+#if STD_REALLOC_UNSUPPORTED
+static inline RT_API_ATTRS void *realloc(void *ptr, std::size_t newByteSize) {
+ // Return nullptr and let the callers assert that.
+ // TODO: we can provide a straightforward implementation
+ // via malloc/memcpy/free.
+ return nullptr;
+}
+#else // !STD_REALLOC_UNSUPPORTED
+using std::realloc;
+#endif // !STD_REALLOC_UNSUPPORTED
+
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
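
The realloc() stub above returns nullptr on offload devices because, as the TODO notes, a real implementation would need malloc/memcpy/free. Strictly, a drop-in replacement cannot be built from those alone, since the size of the existing allocation is not recoverable; a freestanding variant would have to take it explicitly. A standalone sketch of that shape (hypothetical helper, not part of this header):

#include <cstdlib>
#include <cstring>

void *ReallocWithKnownSize(
    void *ptr, std::size_t oldByteSize, std::size_t newByteSize) {
  void *p{std::malloc(newByteSize)};
  if (p && ptr) {
    std::memcpy(
        p, ptr, oldByteSize < newByteSize ? oldByteSize : newByteSize);
    std::free(ptr);
  }
  return p; // as with realloc(), the caller still owns 'ptr' when this is null
}
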
diff --git a/flang/runtime/inquiry.cpp b/flang/runtime/inquiry.cpp
index 5dc692c9a384..2b59a1cfab1a 100644
--- a/flang/runtime/inquiry.cpp
+++ b/flang/runtime/inquiry.cpp
@@ -19,7 +19,7 @@
namespace Fortran::runtime {
extern "C" {
-std::int64_t RTNAME(LboundDim)(
+std::int64_t RTDEF(LboundDim)(
const Descriptor &array, int dim, const char *sourceFile, int line) {
if (dim < 1 || dim > array.rank()) {
Terminator terminator{sourceFile, line};
@@ -30,7 +30,7 @@ std::int64_t RTNAME(LboundDim)(
return static_cast<std::int64_t>(dimension.LowerBound());
}
-void RTNAME(Ubound)(Descriptor &result, const Descriptor &array, int kind,
+void RTDEF(Ubound)(Descriptor &result, const Descriptor &array, int kind,
const char *sourceFile, int line) {
SubscriptValue extent[1]{array.rank()};
result.Establish(TypeCategory::Integer, kind, nullptr, 1, extent,
@@ -55,7 +55,7 @@ void RTNAME(Ubound)(Descriptor &result, const Descriptor &array, int kind,
}
}
-std::int64_t RTNAME(Size)(
+std::int64_t RTDEF(Size)(
const Descriptor &array, const char *sourceFile, int line) {
std::int64_t result{1};
for (int i = 0; i < array.rank(); ++i) {
@@ -65,7 +65,7 @@ std::int64_t RTNAME(Size)(
return result;
}
-std::int64_t RTNAME(SizeDim)(
+std::int64_t RTDEF(SizeDim)(
const Descriptor &array, int dim, const char *sourceFile, int line) {
if (dim < 1 || dim > array.rank()) {
Terminator terminator{sourceFile, line};
diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp
index 9a69a2624641..79d43c7cc884 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang/runtime/io-api.cpp
@@ -185,9 +185,6 @@ template <Direction DIR, template <Direction> class STATE, typename... A>
Cookie BeginExternalListIO(
int unitNumber, const char *sourceFile, int sourceLine, A &&...xs) {
Terminator terminator{sourceFile, sourceLine};
- if (unitNumber == DefaultUnit) {
- unitNumber = DIR == Direction::Input ? 5 : 6;
- }
Cookie errorCookie{nullptr};
ExternalFileUnit *unit{GetOrCreateUnit(
unitNumber, DIR, false /*!unformatted*/, terminator, errorCookie)};
@@ -246,9 +243,6 @@ Cookie BeginExternalFormattedIO(const char *format, std::size_t formatLength,
const Descriptor *formatDescriptor, ExternalUnit unitNumber,
const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
- if (unitNumber == DefaultUnit) {
- unitNumber = DIR == Direction::Input ? 5 : 6;
- }
Cookie errorCookie{nullptr};
ExternalFileUnit *unit{GetOrCreateUnit(
unitNumber, DIR, false /*!unformatted*/, terminator, errorCookie)};
@@ -761,7 +755,8 @@ bool IONAME(SetAccess)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetAccess() called when not in an OPEN statement");
}
@@ -796,7 +791,8 @@ bool IONAME(SetAction)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetAction() called when not in an OPEN statement");
}
@@ -852,7 +848,8 @@ bool IONAME(SetAsynchronous)(
handler.SignalError(IostatBadAsynchronous);
}
}
- } else if (!io.get_if<ErroneousIoStatementState>()) {
+ } else if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
handler.Crash("SetAsynchronous() called when not in an OPEN or external "
"I/O statement");
}
@@ -864,7 +861,8 @@ bool IONAME(SetCarriagecontrol)(
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetCarriageControl() called when not in an OPEN statement");
}
@@ -895,7 +893,8 @@ bool IONAME(SetConvert)(
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetConvert() called when not in an OPEN statement");
}
@@ -919,7 +918,8 @@ bool IONAME(SetEncoding)(
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetEncoding() called when not in an OPEN statement");
}
@@ -949,7 +949,8 @@ bool IONAME(SetForm)(Cookie cookie, const char *keyword, std::size_t length) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetForm() called when not in an OPEN statement");
}
@@ -977,7 +978,8 @@ bool IONAME(SetPosition)(
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetPosition() called when not in an OPEN statement");
}
@@ -1008,7 +1010,8 @@ bool IONAME(SetRecl)(Cookie cookie, std::size_t n) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetRecl() called when not in an OPEN statement");
}
@@ -1093,7 +1096,8 @@ bool IONAME(SetFile)(Cookie cookie, const char *path, std::size_t chars) {
}
open->set_path(path, chars);
return true;
- } else if (!io.get_if<ErroneousIoStatementState>()) {
+ } else if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"SetFile() called when not in an OPEN statement");
}
@@ -1104,7 +1108,8 @@ bool IONAME(GetNewUnit)(Cookie cookie, int &unit, int kind) {
IoStatementState &io{*cookie};
auto *open{io.get_if<OpenStatementState>()};
if (!open) {
- if (!io.get_if<ErroneousIoStatementState>()) {
+ if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
io.GetIoErrorHandler().Crash(
"GetNewUnit() called when not in an OPEN statement");
}
@@ -1361,7 +1366,8 @@ std::size_t IONAME(GetSize)(Cookie cookie) {
if (const auto *formatted{
io.get_if<FormattedIoStatementState<Direction::Input>>()}) {
return formatted->GetEditDescriptorChars();
- } else if (!io.get_if<ErroneousIoStatementState>()) {
+ } else if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
handler.Crash("GetIoSize() called for an I/O statement that is not a "
"formatted READ()");
}
@@ -1376,7 +1382,8 @@ std::size_t IONAME(GetIoLength)(Cookie cookie) {
}
if (const auto *inq{io.get_if<InquireIOLengthState>()}) {
return inq->bytes();
- } else if (!io.get_if<ErroneousIoStatementState>()) {
+ } else if (!io.get_if<NoopStatementState>() &&
+ !io.get_if<ErroneousIoStatementState>()) {
handler.Crash("GetIoLength() called for an I/O statement that is not "
"INQUIRE(IOLENGTH=)");
}
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index dedf1f8364ad..7052a6acf41c 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -189,6 +189,17 @@ InternalListIoStatementState<DIR>::InternalListIoStatementState(
: InternalIoStatementState<DIR>{d, sourceFile, sourceLine},
ioStatementState_{*this} {}
+template <Direction DIR>
+int InternalListIoStatementState<DIR>::EndIoStatement() {
+ if constexpr (DIR == Direction::Input) {
+ if (int status{ListDirectedStatementState<DIR>::EndIoStatement()};
+ status != IostatOk) {
+ return status;
+ }
+ }
+ return InternalIoStatementState<DIR>::EndIoStatement();
+}
+
ExternalIoStatementBase::ExternalIoStatementBase(
ExternalFileUnit &unit, const char *sourceFile, int sourceLine)
: IoStatementBase{sourceFile, sourceLine}, unit_{unit} {}
@@ -569,6 +580,12 @@ std::optional<char32_t> IoStatementState::NextInField(
case '*':
case '\n': // for stream access
return std::nullopt;
+ case '&':
+ case '$':
+ if (edit.IsNamelist()) {
+ return std::nullopt;
+ }
+ break;
case ',':
if (!(edit.modes.editingFlags & decimalComma)) {
return std::nullopt;
@@ -707,6 +724,13 @@ ListDirectedStatementState<Direction::Output>::GetNextDataEdit(
return edit;
}
+int ListDirectedStatementState<Direction::Input>::EndIoStatement() {
+ if (repeatPosition_) {
+ repeatPosition_->Cancel();
+ }
+ return IostatOk;
+}
+
std::optional<DataEdit>
ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
IoStatementState &io, int maxRepeat) {
@@ -819,6 +843,17 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
}
template <Direction DIR>
+int ExternalListIoStatementState<DIR>::EndIoStatement() {
+ if constexpr (DIR == Direction::Input) {
+ if (auto status{ListDirectedStatementState<DIR>::EndIoStatement()};
+ status != IostatOk) {
+ return status;
+ }
+ }
+ return ExternalIoStatementState<DIR>::EndIoStatement();
+}
+
+template <Direction DIR>
bool ExternalUnformattedIoStatementState<DIR>::Receive(
char *data, std::size_t bytes, std::size_t elementBytes) {
if constexpr (DIR == Direction::Output) {
@@ -910,6 +945,16 @@ bool ChildUnformattedIoStatementState<DIR>::Receive(
return this->child().parent().Receive(data, bytes, elementBytes);
}
+template <Direction DIR> int ChildListIoStatementState<DIR>::EndIoStatement() {
+ if constexpr (DIR == Direction::Input) {
+ if (int status{ListDirectedStatementState<DIR>::EndIoStatement()};
+ status != IostatOk) {
+ return status;
+ }
+ }
+ return ChildIoStatementState<DIR>::EndIoStatement();
+}
+
template class InternalIoStatementState<Direction::Output>;
template class InternalIoStatementState<Direction::Input>;
template class InternalFormattedIoStatementState<Direction::Output>;
@@ -1219,6 +1264,7 @@ bool InquireUnitState::Inquire(
case HashInquiryKeyword("SIZE"):
result = -1;
if (unit().IsConnected()) {
+ unit().FlushOutput(*this);
if (auto size{unit().knownSize()}) {
result = *size;
}
@@ -1346,7 +1392,7 @@ bool InquireUnconnectedFileState::Inquire(
case HashInquiryKeyword("SEQUENTIAL"):
case HashInquiryKeyword("STREAM"):
case HashInquiryKeyword("UNFORMATTED"):
- str = "UNKNONN";
+ str = "UNKNOWN";
break;
case HashInquiryKeyword("READ"):
str =
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index 91169f6c6e32..0b6bcbd9af02 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -304,6 +304,7 @@ class ListDirectedStatementState<Direction::Input>
: public FormattedIoStatementState<Direction::Input> {
public:
bool inNamelistSequence() const { return inNamelistSequence_; }
+ int EndIoStatement();
// Skips value separators, handles repetition and null values.
// Vacant when '/' appears; present with descriptor == ListDirectedNullValue
@@ -317,6 +318,9 @@ public:
// NAMELIST input item.
void ResetForNextNamelistItem(bool inNamelistSequence) {
remaining_ = 0;
+ if (repeatPosition_) {
+ repeatPosition_->Cancel();
+ }
eatComma_ = false;
realPart_ = imaginaryPart_ = false;
inNamelistSequence_ = inNamelistSequence;
@@ -399,6 +403,7 @@ public:
const Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0);
IoStatementState &ioStatementState() { return ioStatementState_; }
using ListDirectedStatementState<DIR>::GetNextDataEdit;
+ int EndIoStatement();
private:
IoStatementState ioStatementState_; // points to *this
@@ -474,6 +479,7 @@ class ExternalListIoStatementState : public ExternalIoStatementState<DIR>,
public:
using ExternalIoStatementState<DIR>::ExternalIoStatementState;
using ListDirectedStatementState<DIR>::GetNextDataEdit;
+ int EndIoStatement();
};
template <Direction DIR>
@@ -532,6 +538,7 @@ class ChildListIoStatementState : public ChildIoStatementState<DIR>,
public:
using ChildIoStatementState<DIR>::ChildIoStatementState;
using ListDirectedStatementState<DIR>::GetNextDataEdit;
+ int EndIoStatement();
};
template <Direction DIR>
diff --git a/flang/runtime/matmul-transpose.cpp b/flang/runtime/matmul-transpose.cpp
index 43fcf7c08490..ee5fcd842b02 100644
--- a/flang/runtime/matmul-transpose.cpp
+++ b/flang/runtime/matmul-transpose.cpp
@@ -31,6 +31,11 @@
namespace {
using namespace Fortran::runtime;
+// Suppress the warnings about calling __host__-only std::complex operators,
+// defined in C++ STD header files, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// Contiguous numeric TRANSPOSE(matrix)*matrix multiplication
// TRANSPOSE(matrix(n, rows)) * matrix(n,cols) ->
// matrix(rows, n) * matrix(n,cols) -> matrix(rows,cols)
@@ -54,7 +59,7 @@ using namespace Fortran::runtime;
// 2 RES(I,J) = RES(I,J) + X(K,I)*Y(K,J) ! loop-invariant last term
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
-inline static void MatrixTransposedTimesMatrix(
+inline static RT_API_ATTRS void MatrixTransposedTimesMatrix(
CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
SubscriptValue n, std::size_t xColumnByteStride = 0,
@@ -85,8 +90,10 @@ inline static void MatrixTransposedTimesMatrix(
}
}
+RT_DIAG_POP
+
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
-inline static void MatrixTransposedTimesMatrixHelper(
+inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper(
CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
@@ -110,6 +117,9 @@ inline static void MatrixTransposedTimesMatrixHelper(
}
}
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// Contiguous numeric matrix*vector multiplication
// matrix(rows,n) * column vector(n) -> column vector(rows)
// Straightforward algorithm:
@@ -126,7 +136,7 @@ inline static void MatrixTransposedTimesMatrixHelper(
// 2 RES(I) = RES(I) + X(K,I)*Y(K)
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
bool X_HAS_STRIDED_COLUMNS>
-inline static void MatrixTransposedTimesVector(
+inline static RT_API_ATTRS void MatrixTransposedTimesVector(
CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
std::size_t xColumnByteStride = 0) {
@@ -147,8 +157,10 @@ inline static void MatrixTransposedTimesVector(
}
}
+RT_DIAG_POP
+
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
-inline static void MatrixTransposedTimesVectorHelper(
+inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper(
CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
std::optional<std::size_t> xColumnByteStride) {
@@ -161,10 +173,13 @@ inline static void MatrixTransposedTimesVectorHelper(
}
}
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// Implements an instance of MATMUL for given argument types.
template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
typename YT>
-inline static void DoMatmulTranspose(
+inline static RT_API_ATTRS void DoMatmulTranspose(
std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
const Descriptor &x, const Descriptor &y, Terminator &terminator) {
int xRank{x.rank()};
@@ -325,6 +340,8 @@ inline static void DoMatmulTranspose(
}
}
+RT_DIAG_POP
+
// Maps the dynamic type information from the arguments' descriptors
// to the right instantiation of DoMatmul() for valid combinations of
// types.
@@ -333,8 +350,9 @@ template <bool IS_ALLOCATING> struct MatmulTranspose {
std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
template <TypeCategory XCAT, int XKIND> struct MM1 {
template <TypeCategory YCAT, int YKIND> struct MM2 {
- void operator()(ResultDescriptor &result, const Descriptor &x,
- const Descriptor &y, Terminator &terminator) const {
+ RT_API_ATTRS void operator()(ResultDescriptor &result,
+ const Descriptor &x, const Descriptor &y,
+ Terminator &terminator) const {
if constexpr (constexpr auto resultType{
GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
if constexpr (Fortran::common::IsNumericTypeCategory(
@@ -349,13 +367,13 @@ template <bool IS_ALLOCATING> struct MatmulTranspose {
static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
}
};
- void operator()(ResultDescriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
const Descriptor &y, Terminator &terminator, TypeCategory yCat,
int yKind) const {
ApplyType<MM2, void>(yCat, yKind, terminator, result, x, y, terminator);
}
};
- void operator()(ResultDescriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
const Descriptor &y, const char *sourceFile, int line) const {
Terminator terminator{sourceFile, line};
auto xCatKind{x.type().GetCategoryAndKind()};
@@ -369,14 +387,17 @@ template <bool IS_ALLOCATING> struct MatmulTranspose {
namespace Fortran::runtime {
extern "C" {
-void RTNAME(MatmulTranspose)(Descriptor &result, const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(MatmulTranspose)(Descriptor &result, const Descriptor &x,
const Descriptor &y, const char *sourceFile, int line) {
MatmulTranspose<true>{}(result, x, y, sourceFile, line);
}
-void RTNAME(MatmulTransposeDirect)(const Descriptor &result,
- const Descriptor &x, const Descriptor &y, const char *sourceFile,
- int line) {
+void RTDEF(MatmulTransposeDirect)(const Descriptor &result, const Descriptor &x,
+ const Descriptor &y, const char *sourceFile, int line) {
MatmulTranspose<false>{}(result, x, y, sourceFile, line);
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
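
The RT_DIAG_PUSH / RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN / RT_DIAG_POP brackets above silence compiler diagnostics about calling host-only std::complex operators from functions that may also be compiled for the device. The macros are defined by the runtime's API-attribute header; purely as an illustration (the actual definitions and warning numbers may differ), an NVCC-style expansion could look like:

#if defined(__CUDACC__)
#define RT_DIAG_PUSH _Pragma("nv_diagnostic push")
#define RT_DIAG_POP _Pragma("nv_diagnostic pop")
#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN \
  _Pragma("nv_diag_suppress 20011") _Pragma("nv_diag_suppress 20014")
#else
#define RT_DIAG_PUSH
#define RT_DIAG_POP
#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
#endif
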
diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp
index b46a94de01ce..e4595db77926 100644
--- a/flang/runtime/matmul.cpp
+++ b/flang/runtime/matmul.cpp
@@ -29,14 +29,21 @@
namespace Fortran::runtime {
+// Suppress the warnings about calling __host__-only std::complex operators,
+// defined in C++ STD header files, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// General accumulator for any type and stride; this is not used for
// contiguous numeric cases.
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
class Accumulator {
public:
using Result = AccumulationType<RCAT, RKIND>;
- Accumulator(const Descriptor &x, const Descriptor &y) : x_{x}, y_{y} {}
- void Accumulate(const SubscriptValue xAt[], const SubscriptValue yAt[]) {
+ RT_API_ATTRS Accumulator(const Descriptor &x, const Descriptor &y)
+ : x_{x}, y_{y} {}
+ RT_API_ATTRS void Accumulate(
+ const SubscriptValue xAt[], const SubscriptValue yAt[]) {
if constexpr (RCAT == TypeCategory::Logical) {
sum_ = sum_ ||
(IsLogicalElementTrue(x_, xAt) && IsLogicalElementTrue(y_, yAt));
@@ -45,7 +52,7 @@ public:
static_cast<Result>(*y_.Element<YT>(yAt));
}
}
- Result GetResult() const { return sum_; }
+ RT_API_ATTRS Result GetResult() const { return sum_; }
private:
const Descriptor &x_, &y_;
@@ -71,9 +78,10 @@ private:
// 2 RES(I,J) = RES(I,J) + X(I,K)*Y(K,J) ! loop-invariant last term
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
-inline void MatrixTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
- SubscriptValue rows, SubscriptValue cols, const XT *RESTRICT x,
- const YT *RESTRICT y, SubscriptValue n, std::size_t xColumnByteStride = 0,
+inline RT_API_ATTRS void MatrixTimesMatrix(
+ CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
+ SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
+ SubscriptValue n, std::size_t xColumnByteStride = 0,
std::size_t yColumnByteStride = 0) {
using ResultType = CppTypeFor<RCAT, RKIND>;
std::memset(product, 0, rows * cols * sizeof *product);
@@ -102,11 +110,13 @@ inline void MatrixTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
}
}
+RT_DIAG_POP
+
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
-inline void MatrixTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
- SubscriptValue rows, SubscriptValue cols, const XT *RESTRICT x,
- const YT *RESTRICT y, SubscriptValue n,
- std::optional<std::size_t> xColumnByteStride,
+inline RT_API_ATTRS void MatrixTimesMatrixHelper(
+ CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
+ SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
+ SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
std::optional<std::size_t> yColumnByteStride) {
if (!xColumnByteStride) {
if (!yColumnByteStride) {
@@ -127,6 +137,9 @@ inline void MatrixTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
}
}
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// Contiguous numeric matrix*vector multiplication
// matrix(rows,n) * column vector(n) -> column vector(rows)
// Straightforward algorithm:
@@ -143,9 +156,10 @@ inline void MatrixTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
// 2 RES(J) = RES(J) + X(J,K)*Y(K)
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
bool X_HAS_STRIDED_COLUMNS>
-inline void MatrixTimesVector(CppTypeFor<RCAT, RKIND> *RESTRICT product,
- SubscriptValue rows, SubscriptValue n, const XT *RESTRICT x,
- const YT *RESTRICT y, std::size_t xColumnByteStride = 0) {
+inline RT_API_ATTRS void MatrixTimesVector(
+ CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
+ SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
+ std::size_t xColumnByteStride = 0) {
using ResultType = CppTypeFor<RCAT, RKIND>;
std::memset(product, 0, rows * sizeof *product);
[[maybe_unused]] const XT *RESTRICT xp0{x};
@@ -163,10 +177,13 @@ inline void MatrixTimesVector(CppTypeFor<RCAT, RKIND> *RESTRICT product,
}
}
+RT_DIAG_POP
+
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
-inline void MatrixTimesVectorHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
- SubscriptValue rows, SubscriptValue n, const XT *RESTRICT x,
- const YT *RESTRICT y, std::optional<std::size_t> xColumnByteStride) {
+inline RT_API_ATTRS void MatrixTimesVectorHelper(
+ CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
+ SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
+ std::optional<std::size_t> xColumnByteStride) {
if (!xColumnByteStride) {
MatrixTimesVector<RCAT, RKIND, XT, YT, false>(product, rows, n, x, y);
} else {
@@ -175,6 +192,9 @@ inline void MatrixTimesVectorHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
}
}
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// Contiguous numeric vector*matrix multiplication
// row vector(n) * matrix(n,cols) -> row vector(cols)
// Straightforward algorithm:
@@ -191,9 +211,10 @@ inline void MatrixTimesVectorHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
// 2 RES(J) = RES(J) + X(K)*Y(K,J)
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
bool Y_HAS_STRIDED_COLUMNS>
-inline void VectorTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
- SubscriptValue n, SubscriptValue cols, const XT *RESTRICT x,
- const YT *RESTRICT y, std::size_t yColumnByteStride = 0) {
+inline RT_API_ATTRS void VectorTimesMatrix(
+ CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue n,
+ SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
+ std::size_t yColumnByteStride = 0) {
using ResultType = CppTypeFor<RCAT, RKIND>;
std::memset(product, 0, cols * sizeof *product);
for (SubscriptValue k{0}; k < n; ++k) {
@@ -212,11 +233,14 @@ inline void VectorTimesMatrix(CppTypeFor<RCAT, RKIND> *RESTRICT product,
}
}
+RT_DIAG_POP
+
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
bool SPARSE_COLUMNS = false>
-inline void VectorTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
- SubscriptValue n, SubscriptValue cols, const XT *RESTRICT x,
- const YT *RESTRICT y, std::optional<std::size_t> yColumnByteStride) {
+inline RT_API_ATTRS void VectorTimesMatrixHelper(
+ CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue n,
+ SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
+ std::optional<std::size_t> yColumnByteStride) {
if (!yColumnByteStride) {
VectorTimesMatrix<RCAT, RKIND, XT, YT, false>(product, n, cols, x, y);
} else {
@@ -225,10 +249,13 @@ inline void VectorTimesMatrixHelper(CppTypeFor<RCAT, RKIND> *RESTRICT product,
}
}
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// Implements an instance of MATMUL for given argument types.
template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
typename YT>
-static inline void DoMatmul(
+static inline RT_API_ATTRS void DoMatmul(
std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
const Descriptor &x, const Descriptor &y, Terminator &terminator) {
int xRank{x.rank()};
@@ -398,6 +425,8 @@ static inline void DoMatmul(
}
}
+RT_DIAG_POP
+
// Maps the dynamic type information from the arguments' descriptors
// to the right instantiation of DoMatmul() for valid combinations of
// types.
@@ -406,8 +435,9 @@ template <bool IS_ALLOCATING> struct Matmul {
std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
template <TypeCategory XCAT, int XKIND> struct MM1 {
template <TypeCategory YCAT, int YKIND> struct MM2 {
- void operator()(ResultDescriptor &result, const Descriptor &x,
- const Descriptor &y, Terminator &terminator) const {
+ RT_API_ATTRS void operator()(ResultDescriptor &result,
+ const Descriptor &x, const Descriptor &y,
+ Terminator &terminator) const {
if constexpr (constexpr auto resultType{
GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
if constexpr (common::IsNumericTypeCategory(resultType->first) ||
@@ -421,13 +451,13 @@ template <bool IS_ALLOCATING> struct Matmul {
static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
}
};
- void operator()(ResultDescriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
const Descriptor &y, Terminator &terminator, TypeCategory yCat,
int yKind) const {
ApplyType<MM2, void>(yCat, yKind, terminator, result, x, y, terminator);
}
};
- void operator()(ResultDescriptor &result, const Descriptor &x,
+ RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
const Descriptor &y, const char *sourceFile, int line) const {
Terminator terminator{sourceFile, line};
auto xCatKind{x.type().GetCategoryAndKind()};
@@ -439,13 +469,17 @@ template <bool IS_ALLOCATING> struct Matmul {
};
extern "C" {
-void RTNAME(Matmul)(Descriptor &result, const Descriptor &x,
- const Descriptor &y, const char *sourceFile, int line) {
+RT_EXT_API_GROUP_BEGIN
+
+void RTDEF(Matmul)(Descriptor &result, const Descriptor &x, const Descriptor &y,
+ const char *sourceFile, int line) {
Matmul<true>{}(result, x, y, sourceFile, line);
}
-void RTNAME(MatmulDirect)(const Descriptor &result, const Descriptor &x,
+void RTDEF(MatmulDirect)(const Descriptor &result, const Descriptor &x,
const Descriptor &y, const char *sourceFile, int line) {
Matmul<false>{}(result, x, y, sourceFile, line);
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/memory.cpp b/flang/runtime/memory.cpp
index 5ed737905a9c..aa6ff9723d1a 100644
--- a/flang/runtime/memory.cpp
+++ b/flang/runtime/memory.cpp
@@ -8,11 +8,14 @@
#include "flang/Runtime/memory.h"
#include "terminator.h"
+#include "tools.h"
#include <cstdlib>
namespace Fortran::runtime {
+RT_OFFLOAD_VAR_GROUP_BEGIN
-void *AllocateMemoryOrCrash(const Terminator &terminator, std::size_t bytes) {
+RT_API_ATTRS void *AllocateMemoryOrCrash(
+ const Terminator &terminator, std::size_t bytes) {
if (void *p{std::malloc(bytes)}) {
return p;
}
@@ -24,5 +27,20 @@ void *AllocateMemoryOrCrash(const Terminator &terminator, std::size_t bytes) {
return nullptr;
}
-void FreeMemory(void *p) { std::free(p); }
+RT_API_ATTRS void *ReallocateMemoryOrCrash(
+ const Terminator &terminator, void *ptr, std::size_t newByteSize) {
+ if (void *p{Fortran::runtime::realloc(ptr, newByteSize)}) {
+ return p;
+ }
+ if (newByteSize > 0) {
+ terminator.Crash("Fortran runtime internal error: memory realloc returned "
+ "null, needed %zd bytes",
+ newByteSize);
+ }
+ return nullptr;
+}
+
+RT_API_ATTRS void FreeMemory(void *p) { std::free(p); }
+
+RT_OFFLOAD_VAR_GROUP_END
} // namespace Fortran::runtime
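
ReallocateMemoryOrCrash follows the same contract as AllocateMemoryOrCrash: any genuine failure funnels into Terminator::Crash(), and a null return can only mean a zero-byte request. A hypothetical call site (illustrative only):

// Grow a runtime-owned buffer, crashing on allocation failure.
char *GrowBuffer(
    const Terminator &terminator, char *p, std::size_t newByteSize) {
  return static_cast<char *>(
      ReallocateMemoryOrCrash(terminator, p, newByteSize));
}
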
diff --git a/flang/runtime/misc-intrinsic.cpp b/flang/runtime/misc-intrinsic.cpp
index 19eb9351d47f..56f2028c2ff0 100644
--- a/flang/runtime/misc-intrinsic.cpp
+++ b/flang/runtime/misc-intrinsic.cpp
@@ -8,6 +8,7 @@
#include "flang/Runtime/misc-intrinsic.h"
#include "terminator.h"
+#include "tools.h"
#include "flang/Runtime/descriptor.h"
#include <algorithm>
#include <cstring>
@@ -15,9 +16,9 @@
namespace Fortran::runtime {
-static void TransferImpl(Descriptor &result, const Descriptor &source,
- const Descriptor &mold, const char *sourceFile, int line,
- std::optional<std::int64_t> resultExtent) {
+static RT_API_ATTRS void TransferImpl(Descriptor &result,
+ const Descriptor &source, const Descriptor &mold, const char *sourceFile,
+ int line, std::optional<std::int64_t> resultExtent) {
int rank{resultExtent.has_value() ? 1 : 0};
std::size_t elementBytes{mold.ElementBytes()};
result.Establish(mold.type(), elementBytes, nullptr, rank, nullptr,
@@ -52,8 +53,9 @@ static void TransferImpl(Descriptor &result, const Descriptor &source,
}
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-void RTNAME(Transfer)(Descriptor &result, const Descriptor &source,
+void RTDEF(Transfer)(Descriptor &result, const Descriptor &source,
const Descriptor &mold, const char *sourceFile, int line) {
std::optional<std::int64_t> elements;
if (mold.rank() > 0) {
@@ -67,18 +69,19 @@ void RTNAME(Transfer)(Descriptor &result, const Descriptor &source,
"when SOURCE= is not zero-sized");
}
} else {
- elements = 0;
+ elements = std::int64_t{0};
}
}
return TransferImpl(
result, source, mold, sourceFile, line, std::move(elements));
}
-void RTNAME(TransferSize)(Descriptor &result, const Descriptor &source,
+void RTDEF(TransferSize)(Descriptor &result, const Descriptor &source,
const Descriptor &mold, const char *sourceFile, int line,
std::int64_t size) {
return TransferImpl(result, source, mold, sourceFile, line, size);
}
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index 61815a7cc8a4..e6997bcf945b 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -82,7 +82,7 @@ bool IONAME(OutputNamelist)(Cookie cookie, const NamelistGroup &group) {
static constexpr bool IsLegalIdStart(char32_t ch) {
return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' ||
- ch == '@' || ch == '$';
+ ch == '@';
}
static constexpr bool IsLegalIdChar(char32_t ch) {
@@ -247,6 +247,28 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
return false;
}
+static void StorageSequenceExtension(
+ Descriptor &desc, const Descriptor &source) {
+ // Support the near-universal extension of NAMELIST input into a
+ // designatable storage sequence identified by its initial scalar array
+ // element. For example, treat "A(1) = 1. 2. 3." as if it had been
+ // "A(1:) = 1. 2. 3.".
+ if (desc.rank() == 0 && (source.rank() == 1 || source.IsContiguous())) {
+ if (auto stride{source.rank() == 1
+ ? source.GetDimension(0).ByteStride()
+ : static_cast<SubscriptValue>(source.ElementBytes())};
+ stride != 0) {
+ desc.raw().attribute = CFI_attribute_pointer;
+ desc.raw().rank = 1;
+ desc.GetDimension(0)
+ .SetBounds(1,
+ source.Elements() -
+ ((source.OffsetElement() - desc.OffsetElement()) / stride))
+ .SetByteStride(stride);
+ }
+ }
+}
+
static bool HandleSubstring(
IoStatementState &io, Descriptor &desc, const char *name) {
IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -378,12 +400,13 @@ static bool HandleComponent(IoStatementState &io, Descriptor &desc,
return false;
}
-// Advance to the terminal '/' of a namelist group.
+// Advance to the terminal '/' of a namelist group or leading '&'/'$'
+// of the next.
static void SkipNamelistGroup(IoStatementState &io) {
std::size_t byteCount{0};
while (auto ch{io.GetNextNonBlank(byteCount)}) {
io.HandleRelativePosition(byteCount);
- if (*ch == '/') {
+ if (*ch == '/' || *ch == '&' || *ch == '$') {
break;
} else if (*ch == '\'' || *ch == '"') {
// Skip quoted character literal
@@ -418,7 +441,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
std::size_t byteCount{0};
while (true) {
next = io.GetNextNonBlank(byteCount);
- while (next && *next != '&') {
+ while (next && *next != '&' && *next != '$') {
// Extension: comment lines without ! before namelist groups
if (!io.AdvanceRecord()) {
next.reset();
@@ -430,9 +453,10 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
handler.SignalEnd();
return false;
}
- if (*next != '&') {
+ if (*next != '&' && *next != '$') {
handler.SignalError(
- "NAMELIST input group does not begin with '&' (at '%lc')", *next);
+ "NAMELIST input group does not begin with '&' or '$' (at '%lc')",
+ *next);
return false;
}
io.HandleRelativePosition(byteCount);
@@ -448,7 +472,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
// Read the group's items
while (true) {
next = io.GetNextNonBlank(byteCount);
- if (!next || *next == '/') {
+ if (!next || *next == '/' || *next == '&' || *next == '$') {
break;
}
if (!GetLowerCaseName(io, name, sizeof name)) {
@@ -478,10 +502,14 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
bool hadSubscripts{false};
bool hadSubstring{false};
if (next && (*next == '(' || *next == '%')) {
+ const Descriptor *lastSubscriptBase{nullptr};
+ Descriptor *lastSubscriptDescriptor{nullptr};
do {
Descriptor &mutableDescriptor{staticDesc[whichStaticDesc].descriptor()};
whichStaticDesc ^= 1;
io.HandleRelativePosition(byteCount); // skip over '(' or '%'
+ lastSubscriptDescriptor = nullptr;
+ lastSubscriptBase = nullptr;
if (*next == '(') {
if (!hadSubstring && (hadSubscripts || useDescriptor->rank() == 0)) {
mutableDescriptor = *useDescriptor;
@@ -495,11 +523,12 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
"NAMELIST group '%s'",
name, group.groupName);
return false;
+ } else if (HandleSubscripts(
+ io, mutableDescriptor, *useDescriptor, name)) {
+ lastSubscriptBase = useDescriptor;
+ lastSubscriptDescriptor = &mutableDescriptor;
} else {
- if (!HandleSubscripts(
- io, mutableDescriptor, *useDescriptor, name)) {
- return false;
- }
+ return false;
}
hadSubscripts = true;
} else {
@@ -512,6 +541,9 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
useDescriptor = &mutableDescriptor;
next = io.GetCurrentChar(byteCount);
} while (next && (*next == '(' || *next == '%'));
+ if (lastSubscriptDescriptor) {
+ StorageSequenceExtension(*lastSubscriptDescriptor, *lastSubscriptBase);
+ }
}
// Skip the '='
next = io.GetNextNonBlank(byteCount);
@@ -540,12 +572,15 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
io.HandleRelativePosition(byteCount);
}
}
- if (!next || *next != '/') {
+ if (next && *next == '/') {
+ io.HandleRelativePosition(byteCount);
+ } else if (next && (*next == '&' || *next == '$')) {
+ // stop at beginning of next group
+ } else {
handler.SignalError(
"No '/' found after NAMELIST group '%s'", group.groupName);
return false;
}
- io.HandleRelativePosition(byteCount);
return true;
}
@@ -565,7 +600,7 @@ bool IsNamelistNameOrSlash(IoStatementState &io) {
// TODO: how to deal with NaN(...) ambiguity?
return ch && (*ch == '=' || *ch == '(' || *ch == '%');
} else {
- return *ch == '/';
+ return *ch == '/' || *ch == '&' || *ch == '$';
}
}
}
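
Taken together, the namelist.cpp changes above accept legacy DEC-style input in which groups are introduced with '$' and a scalar array element designator stands for the rest of the array's storage sequence. Illustrative input (not taken from the patch's tests; 'cfg' and 'a' are hypothetical names):

$cfg
  a(2) = 1.  2.  3.
$end

Here the leading '$' is accepted in place of '&', the '$' of the following line terminates the group in place of '/', and "a(2) = 1. 2. 3." fills a(2), a(3), and a(4) as if "a(2:)" had been written.
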
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
index cd54e6b54a2e..6cbf00e0c36c 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang/runtime/numeric.cpp
@@ -17,8 +17,8 @@
namespace Fortran::runtime {
template <typename RES>
-inline RES getIntArgValue(const char *source, int line, void *arg, int kind,
- std::int64_t defaultValue, int resKind) {
+inline RT_API_ATTRS RES getIntArgValue(const char *source, int line, void *arg,
+ int kind, std::int64_t defaultValue, int resKind) {
RES res;
if (!arg) {
res = static_cast<RES>(defaultValue);
@@ -49,7 +49,8 @@ inline RES getIntArgValue(const char *source, int line, void *arg, int kind,
}
// NINT (16.9.141)
-template <typename RESULT, typename ARG> inline RESULT Nint(ARG x) {
+template <typename RESULT, typename ARG>
+inline RT_API_ATTRS RESULT Nint(ARG x) {
if (x >= 0) {
return std::trunc(x + ARG{0.5});
} else {
@@ -58,15 +59,18 @@ template <typename RESULT, typename ARG> inline RESULT Nint(ARG x) {
}
// CEILING & FLOOR (16.9.43, .79)
-template <typename RESULT, typename ARG> inline RESULT Ceiling(ARG x) {
+template <typename RESULT, typename ARG>
+inline RT_API_ATTRS RESULT Ceiling(ARG x) {
return std::ceil(x);
}
-template <typename RESULT, typename ARG> inline RESULT Floor(ARG x) {
+template <typename RESULT, typename ARG>
+inline RT_API_ATTRS RESULT Floor(ARG x) {
return std::floor(x);
}
// EXPONENT (16.9.75)
-template <typename RESULT, typename ARG> inline RESULT Exponent(ARG x) {
+template <typename RESULT, typename ARG>
+inline RT_API_ATTRS RESULT Exponent(ARG x) {
if (std::isinf(x) || std::isnan(x)) {
return std::numeric_limits<RESULT>::max(); // +/-Inf, NaN -> HUGE(0)
} else if (x == 0) {
@@ -76,8 +80,13 @@ template <typename RESULT, typename ARG> inline RESULT Exponent(ARG x) {
}
}
+// Suppress the warnings about calling __host__-only std::frexp,
+// defined in C++ STD header files, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
// FRACTION (16.9.80)
-template <typename T> inline T Fraction(T x) {
+template <typename T> inline RT_API_ATTRS T Fraction(T x) {
if (std::isnan(x)) {
return x; // NaN -> same NaN
} else if (std::isinf(x)) {
@@ -90,9 +99,11 @@ template <typename T> inline T Fraction(T x) {
}
}
+RT_DIAG_POP
+
// MOD & MODULO (16.9.135, .136)
template <bool IS_MODULO, typename T>
-inline T IntMod(T x, T p, const char *sourceFile, int sourceLine) {
+inline RT_API_ATTRS T IntMod(T x, T p, const char *sourceFile, int sourceLine) {
if (p == 0) {
Terminator{sourceFile, sourceLine}.Crash(
IS_MODULO ? "MODULO with P==0" : "MOD with P==0");
@@ -104,7 +115,8 @@ inline T IntMod(T x, T p, const char *sourceFile, int sourceLine) {
return mod;
}
template <bool IS_MODULO, typename T>
-inline T RealMod(T a, T p, const char *sourceFile, int sourceLine) {
+inline RT_API_ATTRS T RealMod(
+ T a, T p, const char *sourceFile, int sourceLine) {
if (p == 0) {
Terminator{sourceFile, sourceLine}.Crash(
IS_MODULO ? "MODULO with P==0" : "MOD with P==0");
@@ -120,7 +132,7 @@ inline T RealMod(T a, T p, const char *sourceFile, int sourceLine) {
}
// RRSPACING (16.9.164)
-template <int PREC, typename T> inline T RRSpacing(T x) {
+template <int PREC, typename T> inline RT_API_ATTRS T RRSpacing(T x) {
if (std::isnan(x)) {
return x; // NaN -> same NaN
} else if (std::isinf(x)) {
@@ -133,7 +145,7 @@ template <int PREC, typename T> inline T RRSpacing(T x) {
}
// SCALE (16.9.166)
-template <typename T> inline T Scale(T x, std::int64_t p) {
+template <typename T> inline RT_API_ATTRS T Scale(T x, std::int64_t p) {
auto ip{static_cast<int>(p)};
if (ip != p) {
ip = p < 0 ? std::numeric_limits<int>::min()
@@ -144,7 +156,7 @@ template <typename T> inline T Scale(T x, std::int64_t p) {
// SELECTED_INT_KIND (16.9.169)
template <typename T>
-inline CppTypeFor<TypeCategory::Integer, 4> SelectedIntKind(T x) {
+inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedIntKind(T x) {
if (x <= 2) {
return 1;
} else if (x <= 4) {
@@ -163,7 +175,8 @@ inline CppTypeFor<TypeCategory::Integer, 4> SelectedIntKind(T x) {
// SELECTED_REAL_KIND (16.9.170)
template <typename P, typename R, typename D>
-inline CppTypeFor<TypeCategory::Integer, 4> SelectedRealKind(P p, R r, D d) {
+inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedRealKind(
+ P p, R r, D d) {
if (d != 2) {
return -5;
}
@@ -210,7 +223,7 @@ inline CppTypeFor<TypeCategory::Integer, 4> SelectedRealKind(P p, R r, D d) {
}
// SET_EXPONENT (16.9.171)
-template <typename T> inline T SetExponent(T x, std::int64_t p) {
+template <typename T> inline RT_API_ATTRS T SetExponent(T x, std::int64_t p) {
if (std::isnan(x)) {
return x; // NaN -> same NaN
} else if (std::isinf(x)) {
@@ -229,7 +242,7 @@ template <typename T> inline T SetExponent(T x, std::int64_t p) {
}
// SPACING (16.9.180)
-template <int PREC, typename T> inline T Spacing(T x) {
+template <int PREC, typename T> inline RT_API_ATTRS T Spacing(T x) {
if (std::isnan(x)) {
return x; // NaN -> same NaN
} else if (std::isinf(x)) {
@@ -246,18 +259,18 @@ template <int PREC, typename T> inline T Spacing(T x) {
}
// NEAREST (16.9.139)
-template <int PREC, typename T> inline T Nearest(T x, bool positive) {
- auto spacing{Spacing<PREC>(x)};
- if (x == 0) {
- auto least{std::numeric_limits<T>::denorm_min()};
- return positive ? least : -least;
+template <int PREC, typename T>
+inline RT_API_ATTRS T Nearest(T x, bool positive) {
+ if (positive) {
+ return std::nextafter(x, std::numeric_limits<T>::infinity());
} else {
- return positive ? x + spacing : x - spacing;
+ return std::nextafter(x, -std::numeric_limits<T>::infinity());
}
}
// Exponentiation operator for (Real ** Integer) cases (10.1.5.2.1).
-template <typename BTy, typename ETy> BTy FPowI(BTy base, ETy exp) {
+template <typename BTy, typename ETy>
+RT_API_ATTRS BTy FPowI(BTy base, ETy exp) {
if (exp == ETy{0})
return BTy{1};
bool isNegativePower{exp < ETy{0}};
@@ -289,565 +302,566 @@ template <typename BTy, typename ETy> BTy FPowI(BTy base, ETy exp) {
}
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling4_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Ceiling4_1)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling4_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Ceiling4_2)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Ceiling4_4)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Ceiling4_8)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling4_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Ceiling4_16)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling8_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Ceiling8_1)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling8_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Ceiling8_2)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Ceiling8_4)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Ceiling8_8)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling8_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Ceiling8_16)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling10_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Ceiling10_1)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling10_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Ceiling10_2)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Ceiling10_4)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Ceiling10_8)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling10_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Ceiling10_16)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Ceiling16_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Ceiling16_1)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Ceiling16_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Ceiling16_2)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Ceiling16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Ceiling16_4)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Ceiling16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Ceiling16_8)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Ceiling16_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Ceiling16_16)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Ceiling<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#endif
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent4_4)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Exponent4_8)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent8_4)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Exponent8_8)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent10_4)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Exponent10_8)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Exponent16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Exponent16_4)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Exponent16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Exponent16_8)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Exponent<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor4_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Floor4_1)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor4_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Floor4_2)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Floor4_4)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Floor4_8)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor4_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Floor4_16)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor8_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Floor8_1)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor8_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Floor8_2)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Floor8_4)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Floor8_8)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor8_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Floor8_16)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor10_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Floor10_1)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor10_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Floor10_2)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Floor10_4)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Floor10_8)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor10_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Floor10_16)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Floor16_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Floor16_1)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Floor16_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Floor16_2)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Floor16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Floor16_4)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Floor16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Floor16_8)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Floor16_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Floor16_16)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Floor<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Fraction4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(Fraction4)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Fraction(x);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Fraction8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(Fraction8)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Fraction(x);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Fraction10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(Fraction10)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Fraction(x);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Fraction16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Fraction16)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Fraction(x);
}
#endif
-bool RTNAME(IsFinite4)(CppTypeFor<TypeCategory::Real, 4> x) {
+bool RTDEF(IsFinite4)(CppTypeFor<TypeCategory::Real, 4> x) {
return std::isfinite(x);
}
-bool RTNAME(IsFinite8)(CppTypeFor<TypeCategory::Real, 8> x) {
+bool RTDEF(IsFinite8)(CppTypeFor<TypeCategory::Real, 8> x) {
return std::isfinite(x);
}
#if LDBL_MANT_DIG == 64
-bool RTNAME(IsFinite10)(CppTypeFor<TypeCategory::Real, 10> x) {
+bool RTDEF(IsFinite10)(CppTypeFor<TypeCategory::Real, 10> x) {
return std::isfinite(x);
}
#elif LDBL_MANT_DIG == 113
-bool RTNAME(IsFinite16)(CppTypeFor<TypeCategory::Real, 16> x) {
+bool RTDEF(IsFinite16)(CppTypeFor<TypeCategory::Real, 16> x) {
return std::isfinite(x);
}
#endif
-bool RTNAME(IsNaN4)(CppTypeFor<TypeCategory::Real, 4> x) {
+bool RTDEF(IsNaN4)(CppTypeFor<TypeCategory::Real, 4> x) {
return std::isnan(x);
}
-bool RTNAME(IsNaN8)(CppTypeFor<TypeCategory::Real, 8> x) {
+bool RTDEF(IsNaN8)(CppTypeFor<TypeCategory::Real, 8> x) {
return std::isnan(x);
}
#if LDBL_MANT_DIG == 64
-bool RTNAME(IsNaN10)(CppTypeFor<TypeCategory::Real, 10> x) {
+bool RTDEF(IsNaN10)(CppTypeFor<TypeCategory::Real, 10> x) {
return std::isnan(x);
}
#elif LDBL_MANT_DIG == 113
-bool RTNAME(IsNaN16)(CppTypeFor<TypeCategory::Real, 16> x) {
+bool RTDEF(IsNaN16)(CppTypeFor<TypeCategory::Real, 16> x) {
return std::isnan(x);
}
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(ModInteger1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(ModInteger1)(
CppTypeFor<TypeCategory::Integer, 1> x,
CppTypeFor<TypeCategory::Integer, 1> p, const char *sourceFile,
int sourceLine) {
return IntMod<false>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(ModInteger2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(ModInteger2)(
CppTypeFor<TypeCategory::Integer, 2> x,
CppTypeFor<TypeCategory::Integer, 2> p, const char *sourceFile,
int sourceLine) {
return IntMod<false>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(ModInteger4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(ModInteger4)(
CppTypeFor<TypeCategory::Integer, 4> x,
CppTypeFor<TypeCategory::Integer, 4> p, const char *sourceFile,
int sourceLine) {
return IntMod<false>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(ModInteger8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(ModInteger8)(
CppTypeFor<TypeCategory::Integer, 8> x,
CppTypeFor<TypeCategory::Integer, 8> p, const char *sourceFile,
int sourceLine) {
return IntMod<false>(x, p, sourceFile, sourceLine);
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(ModInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(ModInteger16)(
CppTypeFor<TypeCategory::Integer, 16> x,
CppTypeFor<TypeCategory::Integer, 16> p, const char *sourceFile,
int sourceLine) {
return IntMod<false>(x, p, sourceFile, sourceLine);
}
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(ModReal4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(ModReal4)(
CppTypeFor<TypeCategory::Real, 4> x, CppTypeFor<TypeCategory::Real, 4> p,
const char *sourceFile, int sourceLine) {
return RealMod<false>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(ModReal8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(ModReal8)(
CppTypeFor<TypeCategory::Real, 8> x, CppTypeFor<TypeCategory::Real, 8> p,
const char *sourceFile, int sourceLine) {
return RealMod<false>(x, p, sourceFile, sourceLine);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(ModReal10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(ModReal10)(
CppTypeFor<TypeCategory::Real, 10> x, CppTypeFor<TypeCategory::Real, 10> p,
const char *sourceFile, int sourceLine) {
return RealMod<false>(x, p, sourceFile, sourceLine);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(ModReal16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ModReal16)(
CppTypeFor<TypeCategory::Real, 16> x, CppTypeFor<TypeCategory::Real, 16> p,
const char *sourceFile, int sourceLine) {
return RealMod<false>(x, p, sourceFile, sourceLine);
}
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(ModuloInteger1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(ModuloInteger1)(
CppTypeFor<TypeCategory::Integer, 1> x,
CppTypeFor<TypeCategory::Integer, 1> p, const char *sourceFile,
int sourceLine) {
return IntMod<true>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(ModuloInteger2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(ModuloInteger2)(
CppTypeFor<TypeCategory::Integer, 2> x,
CppTypeFor<TypeCategory::Integer, 2> p, const char *sourceFile,
int sourceLine) {
return IntMod<true>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(ModuloInteger4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(ModuloInteger4)(
CppTypeFor<TypeCategory::Integer, 4> x,
CppTypeFor<TypeCategory::Integer, 4> p, const char *sourceFile,
int sourceLine) {
return IntMod<true>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(ModuloInteger8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(ModuloInteger8)(
CppTypeFor<TypeCategory::Integer, 8> x,
CppTypeFor<TypeCategory::Integer, 8> p, const char *sourceFile,
int sourceLine) {
return IntMod<true>(x, p, sourceFile, sourceLine);
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(ModuloInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(ModuloInteger16)(
CppTypeFor<TypeCategory::Integer, 16> x,
CppTypeFor<TypeCategory::Integer, 16> p, const char *sourceFile,
int sourceLine) {
return IntMod<true>(x, p, sourceFile, sourceLine);
}
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(ModuloReal4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(ModuloReal4)(
CppTypeFor<TypeCategory::Real, 4> x, CppTypeFor<TypeCategory::Real, 4> p,
const char *sourceFile, int sourceLine) {
return RealMod<true>(x, p, sourceFile, sourceLine);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(ModuloReal8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(ModuloReal8)(
CppTypeFor<TypeCategory::Real, 8> x, CppTypeFor<TypeCategory::Real, 8> p,
const char *sourceFile, int sourceLine) {
return RealMod<true>(x, p, sourceFile, sourceLine);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(ModuloReal10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(ModuloReal10)(
CppTypeFor<TypeCategory::Real, 10> x, CppTypeFor<TypeCategory::Real, 10> p,
const char *sourceFile, int sourceLine) {
return RealMod<true>(x, p, sourceFile, sourceLine);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(ModuloReal16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ModuloReal16)(
CppTypeFor<TypeCategory::Real, 16> x, CppTypeFor<TypeCategory::Real, 16> p,
const char *sourceFile, int sourceLine) {
return RealMod<true>(x, p, sourceFile, sourceLine);
}
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Nearest4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(Nearest4)(
CppTypeFor<TypeCategory::Real, 4> x, bool positive) {
return Nearest<24>(x, positive);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Nearest8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(Nearest8)(
CppTypeFor<TypeCategory::Real, 8> x, bool positive) {
return Nearest<53>(x, positive);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Nearest10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(Nearest10)(
CppTypeFor<TypeCategory::Real, 10> x, bool positive) {
return Nearest<64>(x, positive);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Nearest16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Nearest16)(
CppTypeFor<TypeCategory::Real, 16> x, bool positive) {
return Nearest<113>(x, positive);
}
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint4_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Nint4_1)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint4_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Nint4_2)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint4_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Nint4_4)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint4_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Nint4_8)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint4_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Nint4_16)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint8_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Nint8_1)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint8_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Nint8_2)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint8_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Nint8_4)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint8_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Nint8_8)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint8_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Nint8_16)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint10_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Nint10_1)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint10_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Nint10_2)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint10_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Nint10_4)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint10_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Nint10_8)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint10_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Nint10_16)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(Nint16_1)(
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(Nint16_1)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 1>>(x);
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(Nint16_2)(
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(Nint16_2)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 2>>(x);
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(Nint16_4)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(Nint16_4)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 4>>(x);
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(Nint16_8)(
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(Nint16_8)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 8>>(x);
}
#if defined __SIZEOF_INT128__ && !AVOID_NATIVE_UINT128_T
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(Nint16_16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(Nint16_16)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Nint<CppTypeFor<TypeCategory::Integer, 16>>(x);
}
#endif
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(RRSpacing4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(RRSpacing4)(
CppTypeFor<TypeCategory::Real, 4> x) {
return RRSpacing<24>(x);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(RRSpacing8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(RRSpacing8)(
CppTypeFor<TypeCategory::Real, 8> x) {
return RRSpacing<53>(x);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(RRSpacing10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(RRSpacing10)(
CppTypeFor<TypeCategory::Real, 10> x) {
return RRSpacing<64>(x);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(RRSpacing16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(RRSpacing16)(
CppTypeFor<TypeCategory::Real, 16> x) {
return RRSpacing<113>(x);
}
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(SetExponent4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(SetExponent4)(
CppTypeFor<TypeCategory::Real, 4> x, std::int64_t p) {
return SetExponent(x, p);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(SetExponent8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(SetExponent8)(
CppTypeFor<TypeCategory::Real, 8> x, std::int64_t p) {
return SetExponent(x, p);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(SetExponent10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(SetExponent10)(
CppTypeFor<TypeCategory::Real, 10> x, std::int64_t p) {
return SetExponent(x, p);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(SetExponent16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(SetExponent16)(
CppTypeFor<TypeCategory::Real, 16> x, std::int64_t p) {
return SetExponent(x, p);
}
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Scale4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(Scale4)(
CppTypeFor<TypeCategory::Real, 4> x, std::int64_t p) {
return Scale(x, p);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Scale8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(Scale8)(
CppTypeFor<TypeCategory::Real, 8> x, std::int64_t p) {
return Scale(x, p);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Scale10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(Scale10)(
CppTypeFor<TypeCategory::Real, 10> x, std::int64_t p) {
return Scale(x, p);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Scale16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Scale16)(
CppTypeFor<TypeCategory::Real, 16> x, std::int64_t p) {
return Scale(x, p);
}
#endif
// SELECTED_INT_KIND
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(SelectedIntKind)(
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(SelectedIntKind)(
const char *source, int line, void *x, int xKind) {
#ifdef __SIZEOF_INT128__
CppTypeFor<TypeCategory::Integer, 16> r =
@@ -861,9 +875,9 @@ CppTypeFor<TypeCategory::Integer, 4> RTNAME(SelectedIntKind)(
}
// SELECTED_REAL_KIND
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(SelectedRealKind)(
- const char *source, int line, void *precision, int pKind, void *range,
- int rKind, void *radix, int dKind) {
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(SelectedRealKind)(const char *source,
+ int line, void *precision, int pKind, void *range, int rKind, void *radix,
+ int dKind) {
#ifdef __SIZEOF_INT128__
CppTypeFor<TypeCategory::Integer, 16> p =
getIntArgValue<CppTypeFor<TypeCategory::Integer, 16>>(
@@ -885,74 +899,76 @@ CppTypeFor<TypeCategory::Integer, 4> RTNAME(SelectedRealKind)(
return SelectedRealKind(p, r, d);
}
-CppTypeFor<TypeCategory::Real, 4> RTNAME(Spacing4)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(Spacing4)(
CppTypeFor<TypeCategory::Real, 4> x) {
return Spacing<24>(x);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(Spacing8)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(Spacing8)(
CppTypeFor<TypeCategory::Real, 8> x) {
return Spacing<53>(x);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(Spacing10)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(Spacing10)(
CppTypeFor<TypeCategory::Real, 10> x) {
return Spacing<64>(x);
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(Spacing16)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Spacing16)(
CppTypeFor<TypeCategory::Real, 16> x) {
return Spacing<113>(x);
}
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(FPow4i)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(FPow4i)(
CppTypeFor<TypeCategory::Real, 4> b,
CppTypeFor<TypeCategory::Integer, 4> e) {
return FPowI(b, e);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(FPow8i)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(FPow8i)(
CppTypeFor<TypeCategory::Real, 8> b,
CppTypeFor<TypeCategory::Integer, 4> e) {
return FPowI(b, e);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(FPow10i)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(FPow10i)(
CppTypeFor<TypeCategory::Real, 10> b,
CppTypeFor<TypeCategory::Integer, 4> e) {
return FPowI(b, e);
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(FPow16i)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(FPow16i)(
CppTypeFor<TypeCategory::Real, 16> b,
CppTypeFor<TypeCategory::Integer, 4> e) {
return FPowI(b, e);
}
#endif
-CppTypeFor<TypeCategory::Real, 4> RTNAME(FPow4k)(
+CppTypeFor<TypeCategory::Real, 4> RTDEF(FPow4k)(
CppTypeFor<TypeCategory::Real, 4> b,
CppTypeFor<TypeCategory::Integer, 8> e) {
return FPowI(b, e);
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(FPow8k)(
+CppTypeFor<TypeCategory::Real, 8> RTDEF(FPow8k)(
CppTypeFor<TypeCategory::Real, 8> b,
CppTypeFor<TypeCategory::Integer, 8> e) {
return FPowI(b, e);
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(FPow10k)(
+CppTypeFor<TypeCategory::Real, 10> RTDEF(FPow10k)(
CppTypeFor<TypeCategory::Real, 10> b,
CppTypeFor<TypeCategory::Integer, 8> e) {
return FPowI(b, e);
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(FPow16k)(
+CppTypeFor<TypeCategory::Real, 16> RTDEF(FPow16k)(
CppTypeFor<TypeCategory::Real, 16> b,
CppTypeFor<TypeCategory::Integer, 8> e) {
return FPowI(b, e);
}
#endif
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
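
The NEAREST hunk above drops the Spacing-based arithmetic and delegates to std::nextafter, which already handles the x == 0 case that the old code special-cased. A minimal host-side sketch of the same idea; NearestSketch is an illustrative name, not something from the patch:

#include <cmath>
#include <limits>

// Return the representable value adjacent to x in the requested direction,
// mirroring what the rewritten Nearest<PREC> template does via std::nextafter.
template <typename T> T NearestSketch(T x, bool positive) {
  const T inf{std::numeric_limits<T>::infinity()};
  return std::nextafter(x, positive ? inf : -inf);
}

// For example, NearestSketch(0.0f, true) yields the smallest positive subnormal
// float -- the same value the removed x == 0 branch used to return explicitly.
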
diff --git a/flang/runtime/pointer.cpp b/flang/runtime/pointer.cpp
index b0003add7b35..f83c00089813 100644
--- a/flang/runtime/pointer.cpp
+++ b/flang/runtime/pointer.cpp
@@ -16,8 +16,9 @@
namespace Fortran::runtime {
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-void RTNAME(PointerNullifyIntrinsic)(Descriptor &pointer, TypeCategory category,
+void RTDEF(PointerNullifyIntrinsic)(Descriptor &pointer, TypeCategory category,
int kind, int rank, int corank) {
INTERNAL_CHECK(corank == 0);
pointer.Establish(TypeCode{category, kind},
@@ -25,20 +26,20 @@ void RTNAME(PointerNullifyIntrinsic)(Descriptor &pointer, TypeCategory category,
CFI_attribute_pointer);
}
-void RTNAME(PointerNullifyCharacter)(Descriptor &pointer, SubscriptValue length,
+void RTDEF(PointerNullifyCharacter)(Descriptor &pointer, SubscriptValue length,
int kind, int rank, int corank) {
INTERNAL_CHECK(corank == 0);
pointer.Establish(
kind, length, nullptr, rank, nullptr, CFI_attribute_pointer);
}
-void RTNAME(PointerNullifyDerived)(Descriptor &pointer,
+void RTDEF(PointerNullifyDerived)(Descriptor &pointer,
const typeInfo::DerivedType &derivedType, int rank, int corank) {
INTERNAL_CHECK(corank == 0);
pointer.Establish(derivedType, nullptr, rank, nullptr, CFI_attribute_pointer);
}
-void RTNAME(PointerSetBounds)(Descriptor &pointer, int zeroBasedDim,
+void RTDEF(PointerSetBounds)(Descriptor &pointer, int zeroBasedDim,
SubscriptValue lower, SubscriptValue upper) {
INTERNAL_CHECK(zeroBasedDim >= 0 && zeroBasedDim < pointer.rank());
pointer.GetDimension(zeroBasedDim).SetBounds(lower, upper);
@@ -47,28 +48,28 @@ void RTNAME(PointerSetBounds)(Descriptor &pointer, int zeroBasedDim,
// TODO: PointerSetCoBounds
-void RTNAME(PointerSetDerivedLength)(
+void RTDEF(PointerSetDerivedLength)(
Descriptor &pointer, int which, SubscriptValue x) {
DescriptorAddendum *addendum{pointer.Addendum()};
INTERNAL_CHECK(addendum != nullptr);
addendum->SetLenParameterValue(which, x);
}
-void RTNAME(PointerApplyMold)(
+void RTDEF(PointerApplyMold)(
Descriptor &pointer, const Descriptor &mold, int rank) {
pointer.ApplyMold(mold, rank);
}
-void RTNAME(PointerAssociateScalar)(Descriptor &pointer, void *target) {
+void RTDEF(PointerAssociateScalar)(Descriptor &pointer, void *target) {
pointer.set_base_addr(target);
}
-void RTNAME(PointerAssociate)(Descriptor &pointer, const Descriptor &target) {
+void RTDEF(PointerAssociate)(Descriptor &pointer, const Descriptor &target) {
pointer = target;
pointer.raw().attribute = CFI_attribute_pointer;
}
-void RTNAME(PointerAssociateLowerBounds)(Descriptor &pointer,
+void RTDEF(PointerAssociateLowerBounds)(Descriptor &pointer,
const Descriptor &target, const Descriptor &lowerBounds) {
pointer = target;
pointer.raw().attribute = CFI_attribute_pointer;
@@ -84,7 +85,7 @@ void RTNAME(PointerAssociateLowerBounds)(Descriptor &pointer,
}
}
-void RTNAME(PointerAssociateRemapping)(Descriptor &pointer,
+void RTDEF(PointerAssociateRemapping)(Descriptor &pointer,
const Descriptor &target, const Descriptor &bounds, const char *sourceFile,
int sourceLine) {
pointer = target;
@@ -122,7 +123,7 @@ void RTNAME(PointerAssociateRemapping)(Descriptor &pointer,
}
}
-int RTNAME(PointerAllocate)(Descriptor &pointer, bool hasStat,
+int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (!pointer.IsPointer()) {
@@ -141,7 +142,7 @@ int RTNAME(PointerAllocate)(Descriptor &pointer, bool hasStat,
return stat;
}
-int RTNAME(PointerAllocateSource)(Descriptor &pointer, const Descriptor &source,
+int RTDEF(PointerAllocateSource)(Descriptor &pointer, const Descriptor &source,
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
int sourceLine) {
int stat{RTNAME(PointerAllocate)(
@@ -153,7 +154,7 @@ int RTNAME(PointerAllocateSource)(Descriptor &pointer, const Descriptor &source,
return stat;
}
-int RTNAME(PointerDeallocate)(Descriptor &pointer, bool hasStat,
+int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (!pointer.IsPointer()) {
@@ -167,7 +168,7 @@ int RTNAME(PointerDeallocate)(Descriptor &pointer, bool hasStat,
errMsg, hasStat);
}
-int RTNAME(PointerDeallocatePolymorphic)(Descriptor &pointer,
+int RTDEF(PointerDeallocatePolymorphic)(Descriptor &pointer,
const typeInfo::DerivedType *derivedType, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(PointerDeallocate)(
@@ -187,11 +188,11 @@ int RTNAME(PointerDeallocatePolymorphic)(Descriptor &pointer,
return stat;
}
-bool RTNAME(PointerIsAssociated)(const Descriptor &pointer) {
+bool RTDEF(PointerIsAssociated)(const Descriptor &pointer) {
return pointer.raw().base_addr != nullptr;
}
-bool RTNAME(PointerIsAssociatedWith)(
+bool RTDEF(PointerIsAssociatedWith)(
const Descriptor &pointer, const Descriptor *target) {
if (!target) {
return pointer.raw().base_addr != nullptr;
@@ -220,5 +221,6 @@ bool RTNAME(PointerIsAssociatedWith)(
// TODO: PointerCheckLengthParameter
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
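
Both files above convert their extern "C" entry points from RTNAME to RTDEF and bracket them with RT_EXT_API_GROUP_BEGIN/END, while helper templates gain RT_API_ATTRS. A rough sketch of how such macros can be arranged so one translation unit builds either host-only or host+device; these are simplified stand-ins and assumptions, not the actual definitions from the flang runtime's entry-name and API-attribute headers:

// Illustrative stand-ins only -- the real macros are more involved.
#if defined(__CUDACC__) || defined(__CUDA__)
#define RT_API_ATTRS __host__ __device__ // helpers compiled for host and device
#else
#define RT_API_ATTRS // plain host build: the attribute disappears
#endif
#define RTNAME(name) _FortranA##name // mangled external entry-point name
// RTDEF(name) names the definition of the same _FortranA<name> symbol; in the
// offload build it is what lets the RT_EXT_API_GROUP_BEGIN/END bracket attach
// device attributes to the whole group of definitions (assumed here, simplified).
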
diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp
index 683cb61fe995..a516bc51a959 100644
--- a/flang/runtime/product.cpp
+++ b/flang/runtime/product.cpp
@@ -18,13 +18,15 @@
namespace Fortran::runtime {
template <typename INTERMEDIATE> class NonComplexProductAccumulator {
public:
- explicit NonComplexProductAccumulator(const Descriptor &array)
+ explicit RT_API_ATTRS NonComplexProductAccumulator(const Descriptor &array)
: array_{array} {}
- void Reinitialize() { product_ = 1; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ RT_API_ATTRS void Reinitialize() { product_ = 1; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
*p = static_cast<A>(product_);
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
product_ *= *array_.Element<A>(at);
return product_ != 0;
}
@@ -34,16 +36,24 @@ private:
INTERMEDIATE product_{1};
};
+// Suppress the warnings about calling __host__-only std::complex operators,
+// defined in C++ STD header files, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
template <typename PART> class ComplexProductAccumulator {
public:
- explicit ComplexProductAccumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() { product_ = std::complex<PART>{1, 0}; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ explicit RT_API_ATTRS ComplexProductAccumulator(const Descriptor &array)
+ : array_{array} {}
+ RT_API_ATTRS void Reinitialize() { product_ = std::complex<PART>{1, 0}; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
using ResultPart = typename A::value_type;
*p = {static_cast<ResultPart>(product_.real()),
static_cast<ResultPart>(product_.imag())};
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
product_ *= *array_.Element<A>(at);
return true;
}
@@ -53,37 +63,37 @@ private:
std::complex<PART> product_{1, 0};
};
+RT_DIAG_POP
+
extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(ProductInteger1)(
- const Descriptor &x, const char *source, int line, int dim,
- const Descriptor *mask) {
+RT_EXT_API_GROUP_BEGIN
+
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(ProductInteger1)(const Descriptor &x,
+ const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 1>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
"PRODUCT");
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(ProductInteger2)(
- const Descriptor &x, const char *source, int line, int dim,
- const Descriptor *mask) {
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(ProductInteger2)(const Descriptor &x,
+ const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 2>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
"PRODUCT");
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(ProductInteger4)(
- const Descriptor &x, const char *source, int line, int dim,
- const Descriptor *mask) {
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(ProductInteger4)(const Descriptor &x,
+ const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 4>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
"PRODUCT");
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(ProductInteger8)(
- const Descriptor &x, const char *source, int line, int dim,
- const Descriptor *mask) {
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(ProductInteger8)(const Descriptor &x,
+ const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 8>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Integer, 8>>{x},
"PRODUCT");
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(ProductInteger16)(
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(ProductInteger16)(
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 16>(x, source, line, dim,
@@ -94,27 +104,27 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(ProductInteger16)(
#endif
// TODO: real/complex(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(ProductReal4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 4> RTDEF(ProductReal4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 4>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
"PRODUCT");
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(ProductReal8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 8> RTDEF(ProductReal8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 8>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
"PRODUCT");
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(ProductReal10)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 10> RTDEF(ProductReal10)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 10>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 10>>{x},
"PRODUCT");
}
#elif LDBL_MANT_DIG == 113
-CppTypeFor<TypeCategory::Real, 16> RTNAME(ProductReal16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 16> RTDEF(ProductReal16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 16>(x, source, line, dim, mask,
NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 16>>{x},
@@ -122,14 +132,14 @@ CppTypeFor<TypeCategory::Real, 16> RTNAME(ProductReal16)(const Descriptor &x,
}
#endif
-void RTNAME(CppProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
+void RTDEF(CppProductComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 4>(x, source, line, dim,
mask, ComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 8>>{x},
"PRODUCT");
}
-void RTNAME(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
+void RTDEF(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 8>(x, source, line, dim,
@@ -137,7 +147,7 @@ void RTNAME(CppProductComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
"PRODUCT");
}
#if LDBL_MANT_DIG == 64
-void RTNAME(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
+void RTDEF(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 10>(x, source, line, dim,
@@ -145,7 +155,7 @@ void RTNAME(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
"PRODUCT");
}
#elif LDBL_MANT_DIG == 113
-void RTNAME(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
+void RTDEF(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 16>(x, source, line, dim,
@@ -154,11 +164,13 @@ void RTNAME(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
}
#endif
-void RTNAME(ProductDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(ProductDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line, const Descriptor *mask) {
TypedPartialNumericReduction<NonComplexProductAccumulator,
NonComplexProductAccumulator, ComplexProductAccumulator>(
result, x, dim, source, line, mask, "PRODUCT");
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
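
product.cpp wraps ComplexProductAccumulator in RT_DIAG_PUSH / RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN / RT_DIAG_POP, the same bracket added around Fraction in numeric.cpp, because the std::complex operators it calls are host-only in CUDA builds. One plausible shape for those wrappers, shown purely as a sketch; the pragma spellings and diagnostic numbers below are assumptions, not taken from the patch:

// Illustrative only: a possible implementation of the diagnostic bracket when
// compiling with a CUDA compiler; exact pragmas vary by compiler and version.
#if defined(__CUDACC__)
#define RT_DIAG_PUSH _Pragma("nv_diagnostic push")
#define RT_DIAG_POP _Pragma("nv_diagnostic pop")
#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN \
  _Pragma("nv_diag_suppress 20011") _Pragma("nv_diag_suppress 20014")
#else
#define RT_DIAG_PUSH
#define RT_DIAG_POP
#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
#endif
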
diff --git a/flang/runtime/ragged.cpp b/flang/runtime/ragged.cpp
index 855aa02e7f59..a4d9e541ba53 100644
--- a/flang/runtime/ragged.cpp
+++ b/flang/runtime/ragged.cpp
@@ -7,20 +7,22 @@
//===----------------------------------------------------------------------===//
#include "flang/Runtime/ragged.h"
+#include "tools.h"
#include <cstdlib>
namespace Fortran::runtime {
-inline bool isIndirection(const RaggedArrayHeader *const header) {
+inline RT_API_ATTRS bool isIndirection(const RaggedArrayHeader *const header) {
return header->flags & 1;
}
-inline std::size_t rank(const RaggedArrayHeader *const header) {
+inline RT_API_ATTRS std::size_t rank(const RaggedArrayHeader *const header) {
return header->flags >> 1;
}
-RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header, bool isHeader,
- std::int64_t rank, std::int64_t elementSize, std::int64_t *extentVector) {
+RT_API_ATTRS RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header,
+ bool isHeader, std::int64_t rank, std::int64_t elementSize,
+ std::int64_t *extentVector) {
if (header && rank) {
std::int64_t size{1};
for (std::int64_t counter{0}; counter < rank; ++counter) {
@@ -32,10 +34,13 @@ RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header, bool isHeader,
header->flags = (rank << 1) | isHeader;
header->extentPointer = extentVector;
if (isHeader) {
- header->bufferPointer = std::calloc(sizeof(RaggedArrayHeader), size);
- } else {
- header->bufferPointer =
- static_cast<void *>(std::calloc(elementSize, size));
+ elementSize = sizeof(RaggedArrayHeader);
+ }
+ Terminator terminator{__FILE__, __LINE__};
+ std::size_t bytes{static_cast<std::size_t>(elementSize * size)};
+ header->bufferPointer = AllocateMemoryOrCrash(terminator, bytes);
+ if (header->bufferPointer) {
+ std::memset(header->bufferPointer, 0, bytes);
}
return header;
} else {
@@ -44,7 +49,7 @@ RaggedArrayHeader *RaggedArrayAllocate(RaggedArrayHeader *header, bool isHeader,
}
// Deallocate a ragged array from the heap.
-void RaggedArrayDeallocate(RaggedArrayHeader *raggedArrayHeader) {
+RT_API_ATTRS void RaggedArrayDeallocate(RaggedArrayHeader *raggedArrayHeader) {
if (raggedArrayHeader) {
if (std::size_t end{rank(raggedArrayHeader)}) {
if (isIndirection(raggedArrayHeader)) {
@@ -66,14 +71,14 @@ void RaggedArrayDeallocate(RaggedArrayHeader *raggedArrayHeader) {
}
extern "C" {
-void *RTNAME(RaggedArrayAllocate)(void *header, bool isHeader,
- std::int64_t rank, std::int64_t elementSize, std::int64_t *extentVector) {
+void *RTDEF(RaggedArrayAllocate)(void *header, bool isHeader, std::int64_t rank,
+ std::int64_t elementSize, std::int64_t *extentVector) {
auto *result = RaggedArrayAllocate(static_cast<RaggedArrayHeader *>(header),
isHeader, rank, elementSize, extentVector);
return static_cast<void *>(result);
}
-void RTNAME(RaggedArrayDeallocate)(void *raggedArrayHeader) {
+void RTDEF(RaggedArrayDeallocate)(void *raggedArrayHeader) {
RaggedArrayDeallocate(static_cast<RaggedArrayHeader *>(raggedArrayHeader));
}
} // extern "C"
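
The ragged.cpp hunk replaces the two std::calloc calls with a single AllocateMemoryOrCrash plus an explicit std::memset, keeping the zero-fill while routing allocation through the runtime's own helper. A condensed restatement of that path; Terminator and AllocateMemoryOrCrash are the flang runtime helpers referenced by the diff (declared in the "tools.h" it now includes), the wrapper name and exact signature here are illustrative:

#include <cstddef>
#include <cstring> // std::memset

// Allocate `bytes` of zero-initialized storage. AllocateMemoryOrCrash reports
// failure through the Terminator; the null check mirrors the defensive check
// kept in the diff before the memset.
static void *AllocateZeroedOrCrash(
    Fortran::runtime::Terminator &terminator, std::size_t bytes) {
  void *p{Fortran::runtime::AllocateMemoryOrCrash(terminator, bytes)};
  if (p) {
    std::memset(p, 0, bytes); // calloc's zero-fill, done explicitly
  }
  return p;
}
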
diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h
index 2aaf5c102a9c..cf1ee8a96775 100644
--- a/flang/runtime/reduction-templates.h
+++ b/flang/runtime/reduction-templates.h
@@ -40,7 +40,7 @@ namespace Fortran::runtime {
// cases of FINDLOC, MAXLOC, & MINLOC). These are the cases without DIM= or
// cases where the argument has rank 1 and DIM=, if present, must be 1.
template <typename TYPE, typename ACCUMULATOR>
-inline void DoTotalReduction(const Descriptor &x, int dim,
+inline RT_API_ATTRS void DoTotalReduction(const Descriptor &x, int dim,
const Descriptor *mask, ACCUMULATOR &accumulator, const char *intrinsic,
Terminator &terminator) {
if (dim < 0 || dim > 1) {
@@ -76,7 +76,7 @@ inline void DoTotalReduction(const Descriptor &x, int dim,
}
template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
-inline CppTypeFor<CAT, KIND> GetTotalReduction(const Descriptor &x,
+inline RT_API_ATTRS CppTypeFor<CAT, KIND> GetTotalReduction(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask,
ACCUMULATOR &&accumulator, const char *intrinsic) {
Terminator terminator{source, line};
@@ -98,7 +98,7 @@ inline CppTypeFor<CAT, KIND> GetTotalReduction(const Descriptor &x,
// lower bounds other than one. This utility subroutine creates an
// array of subscripts [j,_,k] for result subscripts [j,k] so that the
// elements of array(j,:,k) can be reduced.
-inline void GetExpandedSubscripts(SubscriptValue at[],
+inline RT_API_ATTRS void GetExpandedSubscripts(SubscriptValue at[],
const Descriptor &descriptor, int zeroBasedDim,
const SubscriptValue from[]) {
descriptor.GetLowerBounds(at);
@@ -113,8 +113,9 @@ inline void GetExpandedSubscripts(SubscriptValue at[],
}
template <typename TYPE, typename ACCUMULATOR>
-inline void ReduceDimToScalar(const Descriptor &x, int zeroBasedDim,
- SubscriptValue subscripts[], TYPE *result, ACCUMULATOR &accumulator) {
+inline RT_API_ATTRS void ReduceDimToScalar(const Descriptor &x,
+ int zeroBasedDim, SubscriptValue subscripts[], TYPE *result,
+ ACCUMULATOR &accumulator) {
SubscriptValue xAt[maxRank];
GetExpandedSubscripts(xAt, x, zeroBasedDim, subscripts);
const auto &dim{x.GetDimension(zeroBasedDim)};
@@ -133,9 +134,9 @@ inline void ReduceDimToScalar(const Descriptor &x, int zeroBasedDim,
}
template <typename TYPE, typename ACCUMULATOR>
-inline void ReduceDimMaskToScalar(const Descriptor &x, int zeroBasedDim,
- SubscriptValue subscripts[], const Descriptor &mask, TYPE *result,
- ACCUMULATOR &accumulator) {
+inline RT_API_ATTRS void ReduceDimMaskToScalar(const Descriptor &x,
+ int zeroBasedDim, SubscriptValue subscripts[], const Descriptor &mask,
+ TYPE *result, ACCUMULATOR &accumulator) {
SubscriptValue xAt[maxRank], maskAt[maxRank];
GetExpandedSubscripts(xAt, x, zeroBasedDim, subscripts);
GetExpandedSubscripts(maskAt, mask, zeroBasedDim, subscripts);
@@ -162,7 +163,7 @@ inline void ReduceDimMaskToScalar(const Descriptor &x, int zeroBasedDim,
// Utility: establishes & allocates the result array for a partial
// reduction (i.e., one with DIM=).
-static void CreatePartialReductionResult(Descriptor &result,
+static RT_API_ATTRS void CreatePartialReductionResult(Descriptor &result,
const Descriptor &x, std::size_t resultElementSize, int dim,
Terminator &terminator, const char *intrinsic, TypeCode typeCode) {
int xRank{x.rank()};
@@ -192,9 +193,10 @@ static void CreatePartialReductionResult(Descriptor &result,
// Partial reductions with DIM=
template <typename ACCUMULATOR, TypeCategory CAT, int KIND>
-inline void PartialReduction(Descriptor &result, const Descriptor &x,
- std::size_t resultElementSize, int dim, const Descriptor *mask,
- Terminator &terminator, const char *intrinsic, ACCUMULATOR &accumulator) {
+inline RT_API_ATTRS void PartialReduction(Descriptor &result,
+ const Descriptor &x, std::size_t resultElementSize, int dim,
+ const Descriptor *mask, Terminator &terminator, const char *intrinsic,
+ ACCUMULATOR &accumulator) {
CreatePartialReductionResult(result, x, resultElementSize, dim, terminator,
intrinsic, TypeCode{CAT, KIND});
SubscriptValue at[maxRank];
@@ -233,8 +235,8 @@ struct PartialIntegerReductionHelper {
template <int KIND> struct Functor {
static constexpr int Intermediate{
std::max(KIND, 4)}; // use at least "int" for intermediate results
- void operator()(Descriptor &result, const Descriptor &x, int dim,
- const Descriptor *mask, Terminator &terminator,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
+ int dim, const Descriptor *mask, Terminator &terminator,
const char *intrinsic) const {
using Accumulator =
ACCUM<CppTypeFor<TypeCategory::Integer, Intermediate>>;
@@ -248,9 +250,9 @@ struct PartialIntegerReductionHelper {
};
template <template <typename> class INTEGER_ACCUM>
-inline void PartialIntegerReduction(Descriptor &result, const Descriptor &x,
- int dim, int kind, const Descriptor *mask, const char *intrinsic,
- Terminator &terminator) {
+inline RT_API_ATTRS void PartialIntegerReduction(Descriptor &result,
+ const Descriptor &x, int dim, int kind, const Descriptor *mask,
+ const char *intrinsic, Terminator &terminator) {
ApplyIntegerKind<
PartialIntegerReductionHelper<INTEGER_ACCUM>::template Functor, void>(
kind, terminator, result, x, dim, mask, terminator, intrinsic);
@@ -261,8 +263,8 @@ struct PartialFloatingReductionHelper {
template <int KIND> struct Functor {
static constexpr int Intermediate{
std::max(KIND, 8)}; // use at least "double" for intermediate results
- void operator()(Descriptor &result, const Descriptor &x, int dim,
- const Descriptor *mask, Terminator &terminator,
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
+ int dim, const Descriptor *mask, Terminator &terminator,
const char *intrinsic) const {
using Accumulator = ACCUM<CppTypeFor<TypeCategory::Real, Intermediate>>;
Accumulator accumulator{x};
@@ -277,7 +279,7 @@ struct PartialFloatingReductionHelper {
template <template <typename> class INTEGER_ACCUM,
template <typename> class REAL_ACCUM,
template <typename> class COMPLEX_ACCUM>
-inline void TypedPartialNumericReduction(Descriptor &result,
+inline RT_API_ATTRS void TypedPartialNumericReduction(Descriptor &result,
const Descriptor &x, int dim, const char *source, int line,
const Descriptor *mask, const char *intrinsic) {
Terminator terminator{source, line};
@@ -307,7 +309,8 @@ inline void TypedPartialNumericReduction(Descriptor &result,
template <typename ACCUMULATOR> struct LocationResultHelper {
template <int KIND> struct Functor {
- void operator()(ACCUMULATOR &accumulator, const Descriptor &result) const {
+ RT_API_ATTRS void operator()(
+ ACCUMULATOR &accumulator, const Descriptor &result) const {
accumulator.GetResult(
result.OffsetElement<CppTypeFor<TypeCategory::Integer, KIND>>());
}
@@ -316,9 +319,9 @@ template <typename ACCUMULATOR> struct LocationResultHelper {
template <typename ACCUMULATOR> struct PartialLocationHelper {
template <int KIND> struct Functor {
- void operator()(Descriptor &result, const Descriptor &x, int dim,
- const Descriptor *mask, Terminator &terminator, const char *intrinsic,
- ACCUMULATOR &accumulator) const {
+ RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
+ int dim, const Descriptor *mask, Terminator &terminator,
+ const char *intrinsic, ACCUMULATOR &accumulator) const {
// Element size of the destination descriptor is the size
// of {TypeCategory::Integer, KIND}.
PartialReduction<ACCUMULATOR, TypeCategory::Integer, KIND>(result, x,
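
The reduction templates above accept any ACCUMULATOR that provides Reinitialize, AccumulateAt, and GetResult; the accumulators converted in product.cpp and reduction.cpp all follow that shape. A minimal sketch of such an accumulator, modeled on the ones in the diff -- a made-up SUM-style example, assuming the runtime's Descriptor, SubscriptValue, and RT_API_ATTRS are in scope:

// Minimal accumulator matching the interface GetTotalReduction/PartialReduction
// expect: Reinitialize(), AccumulateAt(at) returning "keep iterating", and
// GetResult(p). Not a class from the patch.
template <typename INTERMEDIATE> class SumAccumulatorSketch {
public:
  explicit RT_API_ATTRS SumAccumulatorSketch(const Descriptor &array)
      : array_{array} {}
  RT_API_ATTRS void Reinitialize() { sum_ = 0; }
  template <typename A>
  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
    *p = static_cast<A>(sum_);
  }
  template <typename A>
  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
    sum_ += *array_.Element<A>(at);
    return true; // SUM never exits the element loop early
  }

private:
  const Descriptor &array_;
  INTERMEDIATE sum_{0};
};
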
diff --git a/flang/runtime/reduction.cpp b/flang/runtime/reduction.cpp
index e8c2bd3e77e2..074a270cb508 100644
--- a/flang/runtime/reduction.cpp
+++ b/flang/runtime/reduction.cpp
@@ -24,12 +24,15 @@ namespace Fortran::runtime {
template <typename INTERMEDIATE> class IntegerAndAccumulator {
public:
- explicit IntegerAndAccumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() { and_ = ~INTERMEDIATE{0}; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ explicit RT_API_ATTRS IntegerAndAccumulator(const Descriptor &array)
+ : array_{array} {}
+ RT_API_ATTRS void Reinitialize() { and_ = ~INTERMEDIATE{0}; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
*p = static_cast<A>(and_);
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
and_ &= *array_.Element<A>(at);
return true;
}
@@ -41,12 +44,15 @@ private:
template <typename INTERMEDIATE> class IntegerOrAccumulator {
public:
- explicit IntegerOrAccumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() { or_ = 0; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ explicit RT_API_ATTRS IntegerOrAccumulator(const Descriptor &array)
+ : array_{array} {}
+ RT_API_ATTRS void Reinitialize() { or_ = 0; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
*p = static_cast<A>(or_);
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
or_ |= *array_.Element<A>(at);
return true;
}
@@ -58,12 +64,15 @@ private:
template <typename INTERMEDIATE> class IntegerXorAccumulator {
public:
- explicit IntegerXorAccumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() { xor_ = 0; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ explicit RT_API_ATTRS IntegerXorAccumulator(const Descriptor &array)
+ : array_{array} {}
+ RT_API_ATTRS void Reinitialize() { xor_ = 0; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
*p = static_cast<A>(xor_);
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
xor_ ^= *array_.Element<A>(at);
return true;
}
@@ -74,35 +83,35 @@ private:
};
extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(IAll1)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(IAll1)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 1>(x, source, line, dim, mask,
IntegerAndAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "IALL");
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(IAll2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(IAll2)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 2>(x, source, line, dim, mask,
IntegerAndAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "IALL");
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(IAll4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(IAll4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 4>(x, source, line, dim, mask,
IntegerAndAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "IALL");
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(IAll8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(IAll8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 8>(x, source, line, dim, mask,
IntegerAndAccumulator<CppTypeFor<TypeCategory::Integer, 8>>{x}, "IALL");
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(IAll16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(IAll16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 16>(x, source, line, dim,
mask, IntegerAndAccumulator<CppTypeFor<TypeCategory::Integer, 16>>{x},
"IALL");
}
#endif
-void RTNAME(IAllDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(IAllDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line, const Descriptor *mask) {
Terminator terminator{source, line};
auto catKind{x.type().GetCategoryAndKind()};
@@ -112,35 +121,35 @@ void RTNAME(IAllDim)(Descriptor &result, const Descriptor &x, int dim,
result, x, dim, catKind->second, mask, "IALL", terminator);
}
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(IAny1)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(IAny1)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 1>(x, source, line, dim, mask,
IntegerOrAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "IANY");
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(IAny2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(IAny2)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 2>(x, source, line, dim, mask,
IntegerOrAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "IANY");
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(IAny4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(IAny4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 4>(x, source, line, dim, mask,
IntegerOrAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "IANY");
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(IAny8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(IAny8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 8>(x, source, line, dim, mask,
IntegerOrAccumulator<CppTypeFor<TypeCategory::Integer, 8>>{x}, "IANY");
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(IAny16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(IAny16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 16>(x, source, line, dim,
mask, IntegerOrAccumulator<CppTypeFor<TypeCategory::Integer, 16>>{x},
"IANY");
}
#endif
-void RTNAME(IAnyDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(IAnyDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line, const Descriptor *mask) {
Terminator terminator{source, line};
auto catKind{x.type().GetCategoryAndKind()};
@@ -150,39 +159,39 @@ void RTNAME(IAnyDim)(Descriptor &result, const Descriptor &x, int dim,
result, x, dim, catKind->second, mask, "IANY", terminator);
}
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(IParity1)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(IParity1)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 1>(x, source, line, dim, mask,
IntegerXorAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
"IPARITY");
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(IParity2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(IParity2)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 2>(x, source, line, dim, mask,
IntegerXorAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
"IPARITY");
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(IParity4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(IParity4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 4>(x, source, line, dim, mask,
IntegerXorAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x},
"IPARITY");
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(IParity8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(IParity8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 8>(x, source, line, dim, mask,
IntegerXorAccumulator<CppTypeFor<TypeCategory::Integer, 8>>{x},
"IPARITY");
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(IParity16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(IParity16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 16>(x, source, line, dim,
mask, IntegerXorAccumulator<CppTypeFor<TypeCategory::Integer, 16>>{x},
"IPARITY");
}
#endif
-void RTNAME(IParityDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(IParityDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line, const Descriptor *mask) {
Terminator terminator{source, line};
auto catKind{x.type().GetCategoryAndKind()};
@@ -327,53 +336,54 @@ template <int KIND> struct CountDimension {
};
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-bool RTNAME(All)(const Descriptor &x, const char *source, int line, int dim) {
+bool RTDEF(All)(const Descriptor &x, const char *source, int line, int dim) {
return GetTotalLogicalReduction(x, source, line, dim,
LogicalAccumulator<LogicalReduction::All>{x}, "ALL");
}
-void RTNAME(AllDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(AllDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line) {
Terminator terminator{source, line};
DoReduceLogicalDimension<LogicalReduction::All>(
result, x, dim, terminator, "ALL");
}
-bool RTNAME(Any)(const Descriptor &x, const char *source, int line, int dim) {
+bool RTDEF(Any)(const Descriptor &x, const char *source, int line, int dim) {
return GetTotalLogicalReduction(x, source, line, dim,
LogicalAccumulator<LogicalReduction::Any>{x}, "ANY");
}
-void RTNAME(AnyDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(AnyDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line) {
Terminator terminator{source, line};
DoReduceLogicalDimension<LogicalReduction::Any>(
result, x, dim, terminator, "ANY");
}
-std::int64_t RTNAME(Count)(
+std::int64_t RTDEF(Count)(
const Descriptor &x, const char *source, int line, int dim) {
return GetTotalLogicalReduction(
x, source, line, dim, CountAccumulator{x}, "COUNT");
}
-void RTNAME(CountDim)(Descriptor &result, const Descriptor &x, int dim,
- int kind, const char *source, int line) {
+void RTDEF(CountDim)(Descriptor &result, const Descriptor &x, int dim, int kind,
+ const char *source, int line) {
Terminator terminator{source, line};
ApplyIntegerKind<CountDimension, void>(
kind, terminator, result, x, dim, terminator);
}
-bool RTNAME(Parity)(
- const Descriptor &x, const char *source, int line, int dim) {
+bool RTDEF(Parity)(const Descriptor &x, const char *source, int line, int dim) {
return GetTotalLogicalReduction(x, source, line, dim,
LogicalAccumulator<LogicalReduction::Parity>{x}, "PARITY");
}
-void RTNAME(ParityDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(ParityDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line) {
Terminator terminator{source, line};
DoReduceLogicalDimension<LogicalReduction::Parity>(
result, x, dim, terminator, "PARITY");
}
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp
index c3c148296384..048399737c85 100644
--- a/flang/runtime/sum.cpp
+++ b/flang/runtime/sum.cpp
@@ -23,12 +23,15 @@ namespace Fortran::runtime {
template <typename INTERMEDIATE> class IntegerSumAccumulator {
public:
- explicit IntegerSumAccumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() { sum_ = 0; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ explicit RT_API_ATTRS IntegerSumAccumulator(const Descriptor &array)
+ : array_{array} {}
+ void RT_API_ATTRS Reinitialize() { sum_ = 0; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
*p = static_cast<A>(sum_);
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
sum_ += *array_.Element<A>(at);
return true;
}
@@ -40,13 +43,15 @@ private:
template <typename INTERMEDIATE> class RealSumAccumulator {
public:
- explicit RealSumAccumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() { sum_ = correction_ = 0; }
- template <typename A> A Result() const { return sum_; }
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ explicit RT_API_ATTRS RealSumAccumulator(const Descriptor &array)
+ : array_{array} {}
+ void RT_API_ATTRS Reinitialize() { sum_ = correction_ = 0; }
+ template <typename A> RT_API_ATTRS A Result() const { return sum_; }
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
*p = Result<A>();
}
- template <typename A> bool Accumulate(A x) {
+ template <typename A> RT_API_ATTRS bool Accumulate(A x) {
// Kahan summation
auto next{x + correction_};
auto oldSum{sum_};
@@ -54,7 +59,8 @@ public:
correction_ = (sum_ - oldSum) - next; // algebraically zero
return true;
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
return Accumulate(*array_.Element<A>(at));
}
@@ -65,22 +71,25 @@ private:
template <typename PART> class ComplexSumAccumulator {
public:
- explicit ComplexSumAccumulator(const Descriptor &array) : array_{array} {}
- void Reinitialize() {
+ explicit RT_API_ATTRS ComplexSumAccumulator(const Descriptor &array)
+ : array_{array} {}
+ void RT_API_ATTRS Reinitialize() {
reals_.Reinitialize();
imaginaries_.Reinitialize();
}
- template <typename A> void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+ template <typename A>
+ RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
using ResultPart = typename A::value_type;
*p = {reals_.template Result<ResultPart>(),
imaginaries_.template Result<ResultPart>()};
}
- template <typename A> bool Accumulate(const A &z) {
+ template <typename A> RT_API_ATTRS bool Accumulate(const A &z) {
reals_.Accumulate(z.real());
imaginaries_.Accumulate(z.imag());
return true;
}
- template <typename A> bool AccumulateAt(const SubscriptValue at[]) {
+ template <typename A>
+ RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
return Accumulate(*array_.Element<A>(at));
}
@@ -90,28 +99,30 @@ private:
};
extern "C" {
-CppTypeFor<TypeCategory::Integer, 1> RTNAME(SumInteger1)(const Descriptor &x,
+RT_EXT_API_GROUP_BEGIN
+
+CppTypeFor<TypeCategory::Integer, 1> RTDEF(SumInteger1)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 1>(x, source, line, dim, mask,
IntegerSumAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "SUM");
}
-CppTypeFor<TypeCategory::Integer, 2> RTNAME(SumInteger2)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 2> RTDEF(SumInteger2)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 2>(x, source, line, dim, mask,
IntegerSumAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "SUM");
}
-CppTypeFor<TypeCategory::Integer, 4> RTNAME(SumInteger4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 4> RTDEF(SumInteger4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 4>(x, source, line, dim, mask,
IntegerSumAccumulator<CppTypeFor<TypeCategory::Integer, 4>>{x}, "SUM");
}
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(SumInteger8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 8> RTDEF(SumInteger8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 8>(x, source, line, dim, mask,
IntegerSumAccumulator<CppTypeFor<TypeCategory::Integer, 8>>{x}, "SUM");
}
#ifdef __SIZEOF_INT128__
-CppTypeFor<TypeCategory::Integer, 16> RTNAME(SumInteger16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Integer, 16> RTDEF(SumInteger16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Integer, 16>(x, source, line, dim,
mask, IntegerSumAccumulator<CppTypeFor<TypeCategory::Integer, 16>>{x},
@@ -120,52 +131,52 @@ CppTypeFor<TypeCategory::Integer, 16> RTNAME(SumInteger16)(const Descriptor &x,
#endif
// TODO: real/complex(2 & 3)
-CppTypeFor<TypeCategory::Real, 4> RTNAME(SumReal4)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 4> RTDEF(SumReal4)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 4>(
x, source, line, dim, mask, RealSumAccumulator<double>{x}, "SUM");
}
-CppTypeFor<TypeCategory::Real, 8> RTNAME(SumReal8)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 8> RTDEF(SumReal8)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 8>(
x, source, line, dim, mask, RealSumAccumulator<double>{x}, "SUM");
}
#if LDBL_MANT_DIG == 64
-CppTypeFor<TypeCategory::Real, 10> RTNAME(SumReal10)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 10> RTDEF(SumReal10)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 10>(
x, source, line, dim, mask, RealSumAccumulator<long double>{x}, "SUM");
}
#endif
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-CppTypeFor<TypeCategory::Real, 16> RTNAME(SumReal16)(const Descriptor &x,
+CppTypeFor<TypeCategory::Real, 16> RTDEF(SumReal16)(const Descriptor &x,
const char *source, int line, int dim, const Descriptor *mask) {
return GetTotalReduction<TypeCategory::Real, 16>(
x, source, line, dim, mask, RealSumAccumulator<long double>{x}, "SUM");
}
#endif
-void RTNAME(CppSumComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
+void RTDEF(CppSumComplex4)(CppTypeFor<TypeCategory::Complex, 4> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 4>(
x, source, line, dim, mask, ComplexSumAccumulator<double>{x}, "SUM");
}
-void RTNAME(CppSumComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
+void RTDEF(CppSumComplex8)(CppTypeFor<TypeCategory::Complex, 8> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 8>(
x, source, line, dim, mask, ComplexSumAccumulator<double>{x}, "SUM");
}
#if LDBL_MANT_DIG == 64
-void RTNAME(CppSumComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
+void RTDEF(CppSumComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 10>(
x, source, line, dim, mask, ComplexSumAccumulator<long double>{x}, "SUM");
}
#elif LDBL_MANT_DIG == 113
-void RTNAME(CppSumComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
+void RTDEF(CppSumComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
const Descriptor &x, const char *source, int line, int dim,
const Descriptor *mask) {
result = GetTotalReduction<TypeCategory::Complex, 16>(
@@ -173,10 +184,12 @@ void RTNAME(CppSumComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
}
#endif
-void RTNAME(SumDim)(Descriptor &result, const Descriptor &x, int dim,
+void RTDEF(SumDim)(Descriptor &result, const Descriptor &x, int dim,
const char *source, int line, const Descriptor *mask) {
TypedPartialNumericReduction<IntegerSumAccumulator, RealSumAccumulator,
ComplexSumAccumulator>(result, x, dim, source, line, mask, "SUM");
}
+
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/support.cpp b/flang/runtime/support.cpp
index 88a3e79009f6..12135804f00e 100644
--- a/flang/runtime/support.cpp
+++ b/flang/runtime/support.cpp
@@ -11,10 +11,12 @@
namespace Fortran::runtime {
extern "C" {
+RT_EXT_API_GROUP_BEGIN
-bool RTNAME(IsContiguous)(const Descriptor &descriptor) {
+bool RTDEF(IsContiguous)(const Descriptor &descriptor) {
return descriptor.IsContiguous();
}
+RT_EXT_API_GROUP_END
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index ea659190e143..d69079e43701 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -94,6 +94,31 @@ static inline RT_API_ATTRS std::int64_t GetInt64(
}
}
+static inline RT_API_ATTRS std::optional<std::int64_t> GetInt64Safe(
+ const char *p, std::size_t bytes, Terminator &terminator) {
+ switch (bytes) {
+ case 1:
+ return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 1> *>(p);
+ case 2:
+ return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 2> *>(p);
+ case 4:
+ return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 4> *>(p);
+ case 8:
+ return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 8> *>(p);
+ case 16: {
+ using Int128 = CppTypeFor<TypeCategory::Integer, 16>;
+ auto n{*reinterpret_cast<const Int128 *>(p)};
+ std::int64_t result{static_cast<std::int64_t>(n)};
+ if (static_cast<Int128>(result) == n) {
+ return result;
+ }
+ return std::nullopt;
+ }
+ default:
+ terminator.Crash("GetInt64Safe: no case for %zd bytes", bytes);
+ }
+}
+
template <typename INT>
inline RT_API_ATTRS bool SetInteger(INT &x, int kind, std::int64_t value) {
switch (kind) {
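As an aside for reviewers: GetInt64Safe differs from GetInt64 only in the 16-byte case, where it narrows to 64 bits and returns std::nullopt when the round trip does not reproduce the original value, instead of silently truncating. A minimal standalone sketch of that check, assuming a compiler with __int128 (the names below are illustrative, not the runtime's types):

#include <cstdint>
#include <iostream>
#include <optional>

// Narrow a 128-bit value to 64 bits, or return nullopt if it does not fit.
// Mirrors the __SIZEOF_INT128__-guarded case in GetInt64Safe above.
std::optional<std::int64_t> NarrowToInt64(__int128 n) {
  auto narrowed{static_cast<std::int64_t>(n)};
  if (static_cast<__int128>(narrowed) == n) {
    return narrowed; // round trip preserved the value
  }
  return std::nullopt; // value needs more than 64 bits
}

int main() {
  std::cout << NarrowToInt64(42).has_value() << '\n';                             // prints 1
  std::cout << NarrowToInt64(static_cast<__int128>(1) << 80).has_value() << '\n'; // prints 0
}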
@@ -411,5 +436,27 @@ RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
bool toIsContiguous, bool fromIsContiguous);
RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from);
+// Defines a utility function for copying and padding characters
+template <typename TO, typename FROM>
+RT_API_ATTRS void CopyAndPad(
+ TO *to, const FROM *from, std::size_t toChars, std::size_t fromChars) {
+ if constexpr (sizeof(TO) != sizeof(FROM)) {
+ std::size_t copyChars{std::min(toChars, fromChars)};
+ for (std::size_t j{0}; j < copyChars; ++j) {
+ to[j] = from[j];
+ }
+ for (std::size_t j{copyChars}; j < toChars; ++j) {
+ to[j] = static_cast<TO>(' ');
+ }
+ } else if (toChars <= fromChars) {
+ std::memcpy(to, from, toChars * sizeof(TO));
+ } else {
+ std::memcpy(to, from, std::min(toChars, fromChars) * sizeof(TO));
+ for (std::size_t j{fromChars}; j < toChars; ++j) {
+ to[j] = static_cast<TO>(' ');
+ }
+ }
+}
+
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_TOOLS_H_
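The CopyAndPad helper follows the Fortran character-assignment rule: copy as many characters as fit, then blank-fill the rest, falling back to an element-by-element loop when the two character widths differ. A rough same-width sketch under those assumptions (hypothetical names, not the runtime API):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <string>

// Copy up to toChars characters from 'from' and pad the remainder of 'to'
// with blanks, as Fortran character assignment requires.
void CopyAndPadChar(char *to, const char *from, std::size_t toChars,
                    std::size_t fromChars) {
  std::size_t copyChars{std::min(toChars, fromChars)};
  std::memcpy(to, from, copyChars);
  std::memset(to + copyChars, ' ', toChars - copyChars);
}

int main() {
  char buffer[8];
  CopyAndPadChar(buffer, "abc", sizeof buffer, 3);
  std::cout << '[' << std::string(buffer, sizeof buffer) << "]\n"; // [abc     ]
}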
diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp
index da8ec05c884f..cf1e61c0844d 100644
--- a/flang/runtime/transformational.cpp
+++ b/flang/runtime/transformational.cpp
@@ -52,9 +52,11 @@ public:
}
}
}
+ } else if (auto count{GetInt64Safe(
+ shift_.OffsetElement<char>(), shiftElemLen_, terminator_)}) {
+ shiftCount_ = *count;
} else {
- shiftCount_ =
- GetInt64(shift_.OffsetElement<char>(), shiftElemLen_, terminator_);
+ terminator_.Crash("%s: SHIFT= value exceeds 64 bits", which);
}
}
RT_API_ATTRS SubscriptValue GetShift(const SubscriptValue resultAt[]) const {
@@ -67,8 +69,10 @@ public:
++k;
}
}
- return GetInt64(
- shift_.Element<char>(shiftAt), shiftElemLen_, terminator_);
+ auto count{GetInt64Safe(
+ shift_.Element<char>(shiftAt), shiftElemLen_, terminator_)};
+ RUNTIME_CHECK(terminator_, count.has_value());
+ return *count;
} else {
return shiftCount_; // invariant count extracted in Init()
}
@@ -719,12 +723,15 @@ void RTDEF(Reshape)(Descriptor &result, const Descriptor &source,
std::size_t resultElements{1};
SubscriptValue shapeSubscript{shape.GetDimension(0).LowerBound()};
for (int j{0}; j < resultRank; ++j, ++shapeSubscript) {
- resultExtent[j] = GetInt64(
- shape.Element<char>(&shapeSubscript), shapeElementBytes, terminator);
- if (resultExtent[j] < 0) {
+ auto extent{GetInt64Safe(
+ shape.Element<char>(&shapeSubscript), shapeElementBytes, terminator)};
+ if (!extent) {
+ terminator.Crash("RESHAPE: value of SHAPE(%d) exceeds 64 bits", j + 1);
+ } else if (*extent < 0) {
terminator.Crash("RESHAPE: bad value for SHAPE(%d)=%jd", j + 1,
- static_cast<std::intmax_t>(resultExtent[j]));
+ static_cast<std::intmax_t>(*extent));
}
+ resultExtent[j] = *extent;
resultElements *= resultExtent[j];
}
@@ -762,14 +769,16 @@ void RTDEF(Reshape)(Descriptor &result, const Descriptor &source,
SubscriptValue orderSubscript{order->GetDimension(0).LowerBound()};
std::size_t orderElementBytes{order->ElementBytes()};
for (SubscriptValue j{0}; j < resultRank; ++j, ++orderSubscript) {
- auto k{GetInt64(order->Element<char>(&orderSubscript), orderElementBytes,
- terminator)};
- if (k < 1 || k > resultRank || ((values >> k) & 1)) {
+ auto k{GetInt64Safe(order->Element<char>(&orderSubscript),
+ orderElementBytes, terminator)};
+ if (!k) {
+ terminator.Crash("RESHAPE: ORDER element value exceeds 64 bits");
+ } else if (*k < 1 || *k > resultRank || ((values >> *k) & 1)) {
terminator.Crash("RESHAPE: bad value for ORDER element (%jd)",
- static_cast<std::intmax_t>(k));
+ static_cast<std::intmax_t>(*k));
}
- values |= std::uint64_t{1} << k;
- dimOrder[j] = k - 1;
+ values |= std::uint64_t{1} << *k;
+ dimOrder[j] = *k - 1;
}
} else {
for (int j{0}; j < resultRank; ++j) {
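The reworked ORDER= validation above keeps a bitmask of the values already seen, so an out-of-range or repeated element is rejected with a single comparison before the crash message is issued. A small isolated sketch of that idea (illustrative only, not the runtime's code):

#include <cstdint>
#include <iostream>

// Returns true if order[0..rank-1] is a permutation of 1..rank, using a
// bitmask to detect out-of-range or repeated values.
bool IsValidOrder(const int *order, int rank) {
  std::uint64_t seen{0};
  for (int j{0}; j < rank; ++j) {
    int k{order[j]};
    if (k < 1 || k > rank || ((seen >> k) & 1)) {
      return false; // out of range or already used
    }
    seen |= std::uint64_t{1} << k;
  }
  return true;
}

int main() {
  int good[]{3, 1, 2};
  int bad[]{1, 1, 3};
  std::cout << IsValidOrder(good, 3) << ' ' << IsValidOrder(bad, 3) << '\n'; // 1 0
}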
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index e4f346ae941f..18590567c65e 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -11,6 +11,7 @@
#include "lock.h"
#include "tools.h"
#include "unit-map.h"
+#include "flang/Runtime/magic-numbers.h"
#include <cstdio>
#include <limits>
#include <utility>
@@ -220,21 +221,24 @@ UnitMap &ExternalFileUnit::CreateUnitMap() {
UnitMap &newUnitMap{*New<UnitMap>{terminator}().release()};
bool wasExtant{false};
- ExternalFileUnit &out{*newUnitMap.LookUpOrCreate(6, terminator, wasExtant)};
+ ExternalFileUnit &out{*newUnitMap.LookUpOrCreate(
+ FORTRAN_DEFAULT_OUTPUT_UNIT, terminator, wasExtant)};
RUNTIME_CHECK(terminator, !wasExtant);
out.Predefine(1);
handler.SignalError(out.SetDirection(Direction::Output));
out.isUnformatted = false;
defaultOutput = &out;
- ExternalFileUnit &in{*newUnitMap.LookUpOrCreate(5, terminator, wasExtant)};
+ ExternalFileUnit &in{*newUnitMap.LookUpOrCreate(
+ FORTRAN_DEFAULT_INPUT_UNIT, terminator, wasExtant)};
RUNTIME_CHECK(terminator, !wasExtant);
in.Predefine(0);
handler.SignalError(in.SetDirection(Direction::Input));
in.isUnformatted = false;
defaultInput = &in;
- ExternalFileUnit &error{*newUnitMap.LookUpOrCreate(0, terminator, wasExtant)};
+ ExternalFileUnit &error{
+ *newUnitMap.LookUpOrCreate(FORTRAN_ERROR_UNIT, terminator, wasExtant)};
RUNTIME_CHECK(terminator, !wasExtant);
error.Predefine(2);
handler.SignalError(error.SetDirection(Direction::Output));
diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90
index df119c22a2ea..7c3f1b5a53fe 100644
--- a/flang/test/Driver/dynamic-linker.f90
+++ b/flang/test/Driver/dynamic-linker.f90
@@ -3,18 +3,26 @@
! RUN: %flang -### --target=x86_64-linux-gnu -rpath /path/to/dir -shared \
! RUN: -static %s 2>&1 | FileCheck \
-! RUN: --check-prefixes=GNU-LINKER-OPTIONS %s
+! RUN: --check-prefixes=GNU-LINKER-OPTIONS \
+! RUN: --implicit-check-not=GNU-LINKER-OPTIONS-NOT %s
! RUN: %flang -### --target=x86_64-windows-msvc -rpath /path/to/dir -shared \
! RUN: -static %s 2>&1 | FileCheck \
-! RUN: --check-prefixes=MSVC-LINKER-OPTIONS %s
+! RUN: --check-prefixes=MSVC-LINKER-OPTIONS \
+! RUN: --implicit-check-not=MSVC-LINKER-OPTIONS-NOT %s
+! RUN: %flang -### --target=aarch64-linux-none -rdynamic %s 2>&1 | FileCheck --check-prefixes=RDYNAMIC-LINKER-OPTION %s
! TODO: Could the linker have an extension or a suffix?
! GNU-LINKER-OPTIONS: "{{.*}}ld{{(.exe)?}}"
! GNU-LINKER-OPTIONS-SAME: "-shared"
! GNU-LINKER-OPTIONS-SAME: "-static"
! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
+! GNU-LINKER-OPTIONS-NOT: "-lFortran_main.a"
+
+! RDYNAMIC-LINKER-OPTION: "{{.*}}ld"
+! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic"
! For MSVC, adding -static does not add any additional linker options.
! MSVC-LINKER-OPTIONS: "{{.*}}link{{(.exe)?}}"
! MSVC-LINKER-OPTIONS-SAME: "-dll"
! MSVC-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
+! MSVC-LINKER-OPTIONS-NOT: "/WHOLEARCHIVE:Fortran_main"
diff --git a/flang/test/Driver/frame-pointer-forwarding.f90 b/flang/test/Driver/frame-pointer-forwarding.f90
index fd615987f82f..751494cc6a60 100644
--- a/flang/test/Driver/frame-pointer-forwarding.f90
+++ b/flang/test/Driver/frame-pointer-forwarding.f90
@@ -1,9 +1,12 @@
! Test that flang-new forwards -fno-omit-frame-pointer and -fomit-frame-pointer to the Flang frontend
-! RUN: %flang -fno-omit-frame-pointer --target=x86-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s
-! CHECK: "-mframe-pointer=all"
+! RUN: %flang --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOVALUE
+! CHECK-NOVALUE: "-fc1"{{.*}}"-mframe-pointer=non-leaf"
+
+! RUN: %flang -fomit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONEFP
+! CHECK-NONEFP: "-fc1"{{.*}}"-mframe-pointer=none"
! RUN: %flang -fno-omit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONLEAFFP
-! CHECK-NONLEAFFP: "-mframe-pointer=non-leaf"
+! CHECK-NONLEAFFP: "-fc1"{{.*}}"-mframe-pointer=non-leaf"
-! RUN: %flang -fomit-frame-pointer --target=aarch64-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-NONEFP
-! CHECK-NONEFP: "-mframe-pointer=none"
+! RUN: %flang -fno-omit-frame-pointer --target=x86-none-none -fsyntax-only -### %s -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-ALLFP
+! CHECK-ALLFP: "-fc1"{{.*}}"-mframe-pointer=all"
diff --git a/flang/test/Driver/func-attr.f90 b/flang/test/Driver/func-attr.f90
new file mode 100644
index 000000000000..7442ac54dcaa
--- /dev/null
+++ b/flang/test/Driver/func-attr.f90
@@ -0,0 +1,25 @@
+! Test that -mframe-pointer accepts only specific values and raises an error when given an invalid value.
+
+! REQUIRES: aarch64-registered-target
+
+! RUN: %flang_fc1 -triple aarch64-none-none -mframe-pointer=none -emit-llvm -o - %s 2>&1| FileCheck %s --check-prefix=CHECK-NONEFP
+! RUN: %flang_fc1 -triple aarch64-none-none -mframe-pointer=non-leaf -emit-llvm -o - %s 2>&1| FileCheck %s --check-prefix=CHECK-NONLEAFFP
+! RUN: %flang_fc1 -triple aarch64-none-none -mframe-pointer=all -emit-llvm -o - %s 2>&1| FileCheck %s --check-prefix=CHECK-ALLFP
+! RUN: not %flang_fc1 -triple aarch64-none-none -mframe-pointer=wrongval -emit-llvm -o - %s 2>&1| FileCheck %s --check-prefix=CHECK-WRONGVALUEFP
+
+! CHECK-NONEFP-LABEL: @func_() {
+
+! CHECK-NONLEAFFP-LABEL: @func_()
+! CHECK-NONLEAFFP-SAME: #0
+
+! CHECK-ALLFP-LABEL: @func_()
+! CHECK-ALLFP-SAME: #0
+
+subroutine func
+end subroutine func
+
+! CHECK-NONEFP-NOT: attributes #0 = { "frame-pointer"="{{.*}}" }
+! CHECK-NONLEAFFP: attributes #0 = { "frame-pointer"="non-leaf" }
+! CHECK-ALLFP: attributes #0 = { "frame-pointer"="all" }
+
+! CHECK-WRONGVALUEFP:error: invalid value 'wrongval' in '-mframe-pointer=wrongval'
diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index a3ff416f4d77..45b1717d7187 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -82,5 +82,5 @@ end program
! ALL-NEXT: ExternalNameConversion
! DEBUG-NEXT: AddDebugFoundation
! NO-DEBUG-NOT: AddDebugFoundation
-! ALL-NEXT: FIRToLLVMLowering
+! ALL: FIRToLLVMLowering
! ALL-NOT: LLVMIRLoweringPass
diff --git a/flang/test/Driver/no-duplicate-main.f90 b/flang/test/Driver/no-duplicate-main.f90
index b884e7ecd7f1..88f4430828e0 100644
--- a/flang/test/Driver/no-duplicate-main.f90
+++ b/flang/test/Driver/no-duplicate-main.f90
@@ -1,4 +1,4 @@
-! UNSUPPORTED: system-windows, system-darwin
+! UNSUPPORTED: system-windows, system-darwin, system-aix
! RUN: %flang -x ir -o %t.c-object -c %S/Inputs/no_duplicate_main.ll
! RUN: %flang -o %t -c %s
diff --git a/flang/test/Driver/save-mlir-temps.f90 b/flang/test/Driver/save-mlir-temps.f90
index 50bc83030caa..1c8935fbd7aa 100644
--- a/flang/test/Driver/save-mlir-temps.f90
+++ b/flang/test/Driver/save-mlir-temps.f90
@@ -51,9 +51,9 @@
! Content to check from the MLIR outputs
!--------------------------
! MLIR-FIR-NOT: llvm.func
-! MLIR-FIR: func.func @{{.*}}main() {
+! MLIR-FIR: func.func @{{.*}}main(){{.*}}{
! MLIR-FIR-NOT: func.func
-! MLIR-LLVMIR: llvm.func @{{.*}}main() {
+! MLIR-LLVMIR: llvm.func @{{.*}}main(){{.*}}{
end program
diff --git a/flang/test/Evaluate/fold-nearest.f90 b/flang/test/Evaluate/fold-nearest.f90
index 99af30312841..bd8b020c392a 100644
--- a/flang/test/Evaluate/fold-nearest.f90
+++ b/flang/test/Evaluate/fold-nearest.f90
@@ -26,6 +26,8 @@ module m1
logical, parameter :: test_14 = nearest(0., negZero) == -minSubnormal
!WARN: warning: NEAREST: S argument is zero
logical, parameter :: test_15 = nearest(negZero, 0.) == minSubnormal
+ logical, parameter :: test_16 = nearest(tiny(1.),-1.) == 1.1754942E-38
+ logical, parameter :: test_17 = nearest(tiny(1.),1.) == 1.1754945E-38
end module
module m2
diff --git a/flang/test/Fir/box-offset-codegen.fir b/flang/test/Fir/box-offset-codegen.fir
index 600555cd94ce..389ceebcc065 100644
--- a/flang/test/Fir/box-offset-codegen.fir
+++ b/flang/test/Fir/box-offset-codegen.fir
@@ -7,7 +7,7 @@ func.func @scalar_addr(%scalar : !fir.ref<!fir.box<!fir.type<t>>>) -> !fir.llvm_
return %addr : !fir.llvm_ptr<!fir.ref<!fir.type<t>>>
}
// CHECK-LABEL: define ptr @scalar_addr(
-// CHECK-SAME: ptr %[[BOX:.*]]) {
+// CHECK-SAME: ptr %[[BOX:.*]]){{.*}}{
// CHECK: %[[VAL_0:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[BOX]], i32 0, i32 0
// CHECK: ret ptr %[[VAL_0]]
@@ -16,7 +16,7 @@ func.func @scalar_tdesc(%scalar : !fir.ref<!fir.box<!fir.type<t>>>) -> !fir.llvm
return %tdesc : !fir.llvm_ptr<!fir.tdesc<!fir.type<t>>>
}
// CHECK-LABEL: define ptr @scalar_tdesc(
-// CHECK-SAME: ptr %[[BOX:.*]]) {
+// CHECK-SAME: ptr %[[BOX:.*]]){{.*}}{
// CHECK: %[[VAL_0:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[BOX]], i32 0, i32 7
// CHECK: ret ptr %[[VAL_0]]
@@ -25,7 +25,7 @@ func.func @array_addr(%array : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.ty
return %addr : !fir.llvm_ptr<!fir.ptr<!fir.array<?x!fir.type<t>>>>
}
// CHECK-LABEL: define ptr @array_addr(
-// CHECK-SAME: ptr %[[BOX:.*]]) {
+// CHECK-SAME: ptr %[[BOX:.*]]){{.*}}{
// CHECK: %[[VAL_0:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[BOX]], i32 0, i32 0
// CHECK: ret ptr %[[VAL_0]]
@@ -34,6 +34,6 @@ func.func @array_tdesc(%array : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.t
return %tdesc : !fir.llvm_ptr<!fir.tdesc<!fir.type<t>>>
}
// CHECK-LABEL: define ptr @array_tdesc(
-// CHECK-SAME: ptr %[[BOX:.*]]) {
+// CHECK-SAME: ptr %[[BOX:.*]]){{.*}}{
// CHECK: %[[VAL_0:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[BOX]], i32 0, i32 8
// CHECK: ret ptr %[[VAL_0]]
diff --git a/flang/test/Fir/polymorphic.fir b/flang/test/Fir/polymorphic.fir
index ce8e43b0be65..ecdcdeb57531 100644
--- a/flang/test/Fir/polymorphic.fir
+++ b/flang/test/Fir/polymorphic.fir
@@ -10,7 +10,7 @@ func.func @_QMpolymorphic_testPtest_allocate_unlimited_polymorphic_non_derived()
return
}
-// CHECK-LABEL: define void @_QMpolymorphic_testPtest_allocate_unlimited_polymorphic_non_derived() {
+// CHECK-LABEL: define void @_QMpolymorphic_testPtest_allocate_unlimited_polymorphic_non_derived(){{.*}}{
// CHECK: %[[MEM:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
// CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 0, i32 20180515, i8 0, i8 -1, i8 1, i8 1, ptr null, [1 x i64] undef }, ptr %[[MEM]]
@@ -87,7 +87,7 @@ func.func @_QMunlimitedPsub1(%arg0: !fir.class<!fir.array<?xnone>> {fir.bindc_na
}
// CHECK-LABEL: define void @_QMunlimitedPsub1(
-// CHECK-SAME: ptr %[[ARRAY:.*]]) {
+// CHECK-SAME: ptr %[[ARRAY:.*]]){{.*}}{
// CHECK: %[[BOX:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
// CHECK: %{{.}} = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ARRAY]], i32 0, i32 7, i32 0, i32 2
// CHECK: %[[TYPE_DESC_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ARRAY]], i32 0, i32 8
@@ -151,7 +151,7 @@ func.func @_QQmain() {
return
}
-// CHECK-LABEL: define void @_QQmain() {
+// CHECK-LABEL: define void @_QQmain(){{.*}}{
// CHECK: %[[CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
// CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1E.dt.t.2, [1 x i64] undef }, ptr %[[CLASS_NONE]], align 8
@@ -175,7 +175,7 @@ func.func @_QMmod2Pinitp(%arg0: !fir.ref<!fir.class<!fir.ptr<none>>> {fir.bindc_
func.func private @_FortranAPointerAssociate(!fir.ref<!fir.box<none>>, !fir.box<none>) -> none attributes {fir.runtime}
// CHECK-LABEL: define void @_QMmod2Pinitp(
-// CHECK-SAME: ptr %[[ARG0:.*]]) {
+// CHECK-SAME: ptr %[[ARG0:.*]]){{.*}}{
// CHECK: %[[ALLOCA_CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }
// CHECK: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[ARG0]]
// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[ALLOCA_CLASS_NONE]]
diff --git a/flang/test/Fir/tbaa-codegen.fir b/flang/test/Fir/tbaa-codegen.fir
index fd0eb9c7304e..87bb15c0fea6 100644
--- a/flang/test/Fir/tbaa-codegen.fir
+++ b/flang/test/Fir/tbaa-codegen.fir
@@ -28,7 +28,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ
}
// CHECK-LABEL: define void @_QPsimple(
-// CHECK-SAME: ptr %[[ARG0:.*]]) {
+// CHECK-SAME: ptr %[[ARG0:.*]]){{.*}}{
// [...]
// load a(2):
// CHECK: %[[VAL20:.*]] = getelementptr i8, ptr %{{.*}}, i64 %{{.*}}
diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir
index d73a7b96a538..e649c06731c6 100644
--- a/flang/test/Fir/tbaa-codegen2.fir
+++ b/flang/test/Fir/tbaa-codegen2.fir
@@ -60,7 +60,7 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ
}
}
// CHECK-LABEL: define void @_QPfunc(
-// CHECK-SAME: ptr %[[ARG0:.*]]) {
+// CHECK-SAME: ptr %[[ARG0:.*]]){{.*}}{
// [...]
// CHECK: %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 7, i32 0, i32 0
// box access:
diff --git a/flang/test/Lower/HLFIR/calls-f77.f90 b/flang/test/Lower/HLFIR/calls-f77.f90
index 09eebaae7606..ac5be007eb83 100644
--- a/flang/test/Lower/HLFIR/calls-f77.f90
+++ b/flang/test/Lower/HLFIR/calls-f77.f90
@@ -156,7 +156,7 @@ subroutine return_char(n)
end subroutine
! CHECK-LABEL: func.func @_QPreturn_char(
! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare {{.*}}n
-! CHECK: %[[VAL_2:.*]] = arith.constant -1 : i32
+! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32
! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<i64>
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> index
! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
index cb9dd2fa7956..b943cd3225a5 100644
--- a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
+++ b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
@@ -8,7 +8,7 @@ end subroutine test_int_allocatable
! CHECK-LABEL: func.func @_QPtest_int_allocatable(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<i32>>> {fir.bindc_name = "a"}) {
! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_int_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
-! CHECK: %[[VAL_2:.*]] = arith.constant -1 : i32
+! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32
! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
! CHECK: %[[VAL_5:.*]] = arith.constant {{[0-9]*}} : i32
@@ -28,7 +28,7 @@ end subroutine test_int_pointer
! CHECK-LABEL: func.func @_QPtest_int_pointer(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>> {fir.bindc_name = "p"}) {
! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_int_pointerEp"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-! CHECK: %[[VAL_2:.*]] = arith.constant -1 : i32
+! CHECK: %[[VAL_2:.*]] = arith.constant 6 : i32
! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,{{[0-9]*}}>>
! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,{{[0-9]*}}>>) -> !fir.ref<i8>
! CHECK: %[[VAL_5:.*]] = arith.constant {{[0-9]*}} : i32
diff --git a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90 b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
new file mode 100644
index 000000000000..248b0aff8d28
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90
@@ -0,0 +1,116 @@
+! Test ASSOCIATED() with procedure pointers.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+subroutine test_proc_pointer_1(p, dummy_proc)
+ procedure(), pointer :: p
+ procedure() :: dummy_proc
+ call takes_log(associated(p, dummy_proc))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_proc_pointer_1(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]] : (() -> ()) -> i64
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64
+! CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_6]], %[[VAL_7]] : i64
+! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_10:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_6]] : i64
+! CHECK: %[[VAL_11:.*]] = arith.andi %[[VAL_8]], %[[VAL_10]] : i1
+! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i1) -> !fir.logical<4>
+
+subroutine test_proc_pointer_2(p, p_target)
+ procedure(), pointer :: p, p_target
+ call takes_log(associated(p, p_target))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_proc_pointer_2(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64
+! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_7]] : (() -> ()) -> i64
+! CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_9]] : i64
+! CHECK: %[[VAL_11:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_12:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_8]] : i64
+! CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1
+! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i1) -> !fir.logical<4>
+
+subroutine test_proc_pointer_3(p, dummy_proc)
+ procedure(), pointer :: p
+ procedure(), optional :: dummy_proc
+ call takes_log(associated(p, dummy_proc))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_proc_pointer_3(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) {
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]] : (() -> ()) -> i64
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64
+! CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_6]], %[[VAL_7]] : i64
+! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_10:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_6]] : i64
+! CHECK: %[[VAL_11:.*]] = arith.andi %[[VAL_8]], %[[VAL_10]] : i1
+! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i1) -> !fir.logical<4>
+
+subroutine test_proc_pointer_4(p)
+ procedure(), pointer :: p
+ external :: some_external
+ call takes_log(associated(p, some_external))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_proc_pointer_4(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QPsome_external) : () -> ()
+! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : (() -> ()) -> !fir.boxproc<() -> ()>
+! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64
+! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]] : (() -> ()) -> i64
+! CHECK: %[[VAL_9:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_8]] : i64
+! CHECK: %[[VAL_10:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_11:.*]] = arith.cmpi ne, %[[VAL_10]], %[[VAL_7]] : i64
+! CHECK: %[[VAL_12:.*]] = arith.andi %[[VAL_9]], %[[VAL_11]] : i1
+! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+
+subroutine test_proc_pointer_5(p, dummy_proc)
+ interface
+ character(10) function char_func()
+ end function
+ end interface
+ procedure(char_func), pointer :: p
+ procedure(char_func) :: dummy_proc
+ call takes_log(associated(p, dummy_proc))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_proc_pointer_5(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
+! CHECK-SAME: %[[VAL_1:.*]]: tuple<!fir.boxproc<() -> ()>, i64> {fir.char_proc}) {
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_3:.*]] = fir.extract_value %[[VAL_1]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()>
+! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_5:.*]] = arith.constant 10 : i64
+! CHECK: %[[VAL_6:.*]] = fir.emboxproc %[[VAL_4]] : (() -> ()) -> !fir.boxproc<() -> ()>
+! CHECK: %[[VAL_7:.*]] = fir.undefined tuple<!fir.boxproc<() -> ()>, i64>
+! CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_6]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>, !fir.boxproc<() -> ()>) -> tuple<!fir.boxproc<() -> ()>, i64>
+! CHECK: %[[VAL_9:.*]] = fir.insert_value %[[VAL_8]], %[[VAL_5]], [1 : index] : (tuple<!fir.boxproc<() -> ()>, i64>, i64) -> tuple<!fir.boxproc<() -> ()>, i64>
+! CHECK: %[[VAL_10:.*]] = fir.extract_value %[[VAL_9]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()>
+! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_10]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_12]] : (() -> ()) -> i64
+! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_13]] : (() -> ()) -> i64
+! CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_15]] : i64
+! CHECK: %[[VAL_17:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_14]] : i64
+! CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_16]], %[[VAL_18]] : i1
+! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i1) -> !fir.logical<4>
diff --git a/flang/test/Lower/Intrinsics/c_f_procpointer.f90 b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
new file mode 100644
index 000000000000..f70a56c91b91
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/c_f_procpointer.f90
@@ -0,0 +1,42 @@
+! Test C_F_PROCPOINTER() lowering.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+subroutine test_c_funloc(fptr, cptr)
+ use iso_c_binding, only : c_f_procpointer, c_funptr
+ real, pointer, external :: fptr
+ type(c_funptr), cptr
+ call c_f_procpointer(cptr, fptr)
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_c_funloc(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
+! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#1, %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ())
+! CHECK: %[[VAL_8:.*]] = fir.emboxproc %[[VAL_7]] : (() -> ()) -> !fir.boxproc<() -> ()>
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
+
+subroutine test_c_funloc_char(fptr, cptr)
+ use iso_c_binding, only : c_f_procpointer, c_funptr
+ interface
+ character(10) function char_func()
+ end function
+ end interface
+ procedure(char_func), pointer :: fptr
+ type(c_funptr), cptr
+ call c_f_procpointer(cptr, fptr)
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_c_funloc_char(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>,
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) {
+! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>)
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
+! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#1, %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ())
+! CHECK: %[[VAL_8:.*]] = fir.emboxproc %[[VAL_7]] : (() -> ()) -> !fir.boxproc<() -> ()>
+! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]]#1 : !fir.ref<!fir.boxproc<() -> ()>>
diff --git a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
new file mode 100644
index 000000000000..c9578b17ac52
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90
@@ -0,0 +1,38 @@
+! Test C_FUNLOC() with procedure pointers.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+subroutine test_c_funloc(p)
+ use iso_c_binding, only : c_funloc
+ real, pointer, external :: p
+ call test(c_funloc(p))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_c_funloc(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
+! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
+! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (() -> ()) -> i64
+! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<i64>
+
+subroutine test_c_funloc_char(p)
+ use iso_c_binding, only : c_funloc
+ interface
+ character(10) function char_func()
+ end function
+ end interface
+ procedure(char_func), pointer :: p
+ call test(c_funloc(p))
+end subroutine
+! CHECK-LABEL: func.func @_QPtest_c_funloc_char(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) {
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>)
+! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>>
+! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
+! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>
+! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]] : (!fir.boxproc<() -> ()>) -> (() -> ())
+! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (() -> ()) -> i64
+! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<i64>
diff --git a/flang/test/Lower/OpenACC/acc-device-type.f90 b/flang/test/Lower/OpenACC/acc-device-type.f90
new file mode 100644
index 000000000000..871dbc95f60f
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-device-type.f90
@@ -0,0 +1,44 @@
+! This test checks lowering of the OpenACC device_type clause on directives where its
+! position and the clauses that follow have special semantics.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+subroutine sub1()
+
+ !$acc parallel num_workers(16)
+ !$acc end parallel
+
+! CHECK: acc.parallel num_workers(%c16{{.*}} : i32) {
+
+ !$acc parallel num_workers(1) device_type(nvidia) num_workers(16)
+ !$acc end parallel
+
+! CHECK: acc.parallel num_workers(%c1{{.*}} : i32, %c16{{.*}} : i32 [#acc.device_type<nvidia>])
+
+ !$acc parallel device_type(*) num_workers(1) device_type(nvidia) num_workers(16)
+ !$acc end parallel
+
+! CHECK: acc.parallel num_workers(%c1{{.*}} : i32 [#acc.device_type<star>], %c16{{.*}} : i32 [#acc.device_type<nvidia>])
+
+ !$acc parallel vector_length(1)
+ !$acc end parallel
+
+! CHECK: acc.parallel vector_length(%c1{{.*}} : i32)
+
+ !$acc parallel device_type(multicore) vector_length(1)
+ !$acc end parallel
+
+! CHECK: acc.parallel vector_length(%c1{{.*}} : i32 [#acc.device_type<multicore>])
+
+ !$acc parallel num_gangs(2) device_type(nvidia) num_gangs(4)
+ !$acc end parallel
+
+! CHECK: acc.parallel num_gangs({%c2{{.*}} : i32}, {%c4{{.*}} : i32} [#acc.device_type<nvidia>])
+
+ !$acc parallel num_gangs(2) device_type(nvidia) num_gangs(1, 1, 1)
+ !$acc end parallel
+
+! CHECK: acc.parallel num_gangs({%c2{{.*}} : i32}, {%c1{{.*}} : i32, %c1{{.*}} : i32, %c1{{.*}} : i32} [#acc.device_type<nvidia>])
+
+
+end subroutine
diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90
index 34e723269724..93bc699031d5 100644
--- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90
@@ -62,7 +62,7 @@ subroutine acc_kernels_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.terminator
-! CHECK-NEXT: } attributes {asyncAttr}
+! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
!$acc kernels loop async(1)
DO i = 1, n
@@ -103,7 +103,7 @@ subroutine acc_kernels_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.terminator
-! CHECK-NEXT: } attributes {waitAttr}
+! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]}
!$acc kernels loop wait(1)
DO i = 1, n
@@ -111,7 +111,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.kernels wait([[WAIT1]] : i32) {
+! CHECK: acc.kernels wait({[[WAIT1]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -126,7 +126,7 @@ subroutine acc_kernels_loop
! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
-! CHECK: acc.kernels wait([[WAIT2]], [[WAIT3]] : i32, i32) {
+! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -141,7 +141,7 @@ subroutine acc_kernels_loop
! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels wait([[WAIT4]], [[WAIT5]] : i32, i32) {
+! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -155,7 +155,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.kernels num_gangs([[NUMGANGS1]] : i32) {
+! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -169,7 +169,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels num_gangs([[NUMGANGS2]] : i32) {
+! CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
diff --git a/flang/test/Lower/OpenACC/acc-kernels.f90 b/flang/test/Lower/OpenACC/acc-kernels.f90
index 1f882c6df510..99629bb83517 100644
--- a/flang/test/Lower/OpenACC/acc-kernels.f90
+++ b/flang/test/Lower/OpenACC/acc-kernels.f90
@@ -40,7 +40,7 @@ subroutine acc_kernels
! CHECK: acc.kernels {
! CHECK: acc.terminator
-! CHECK-NEXT: } attributes {asyncAttr}
+! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
!$acc kernels async(1)
!$acc end kernels
@@ -63,13 +63,13 @@ subroutine acc_kernels
! CHECK: acc.kernels {
! CHECK: acc.terminator
-! CHECK-NEXT: } attributes {waitAttr}
+! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]}
!$acc kernels wait(1)
!$acc end kernels
! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.kernels wait([[WAIT1]] : i32) {
+! CHECK: acc.kernels wait({[[WAIT1]] : i32}) {
! CHECK: acc.terminator
! CHECK-NEXT: }{{$}}
@@ -78,7 +78,7 @@ subroutine acc_kernels
! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
-! CHECK: acc.kernels wait([[WAIT2]], [[WAIT3]] : i32, i32) {
+! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
! CHECK: acc.terminator
! CHECK-NEXT: }{{$}}
@@ -87,7 +87,7 @@ subroutine acc_kernels
! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels wait([[WAIT4]], [[WAIT5]] : i32, i32) {
+! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
! CHECK: acc.terminator
! CHECK-NEXT: }{{$}}
@@ -95,7 +95,7 @@ subroutine acc_kernels
!$acc end kernels
! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.kernels num_gangs([[NUMGANGS1]] : i32) {
+! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) {
! CHECK: acc.terminator
! CHECK-NEXT: }{{$}}
@@ -103,7 +103,7 @@ subroutine acc_kernels
!$acc end kernels
! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels num_gangs([[NUMGANGS2]] : i32) {
+! CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) {
! CHECK: acc.terminator
! CHECK-NEXT: }{{$}}
diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
index 1856215ce59d..deee7089033e 100644
--- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
@@ -64,7 +64,7 @@ subroutine acc_parallel_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {asyncAttr}
+! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
!$acc parallel loop async(1)
DO i = 1, n
@@ -105,7 +105,7 @@ subroutine acc_parallel_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {waitAttr}
+! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]}
!$acc parallel loop wait(1)
DO i = 1, n
@@ -113,7 +113,7 @@ subroutine acc_parallel_loop
END DO
! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.parallel wait([[WAIT1]] : i32) {
+! CHECK: acc.parallel wait({[[WAIT1]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -128,7 +128,7 @@ subroutine acc_parallel_loop
! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
-! CHECK: acc.parallel wait([[WAIT2]], [[WAIT3]] : i32, i32) {
+! CHECK: acc.parallel wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -143,7 +143,7 @@ subroutine acc_parallel_loop
! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.parallel wait([[WAIT4]], [[WAIT5]] : i32, i32) {
+! CHECK: acc.parallel wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -157,7 +157,7 @@ subroutine acc_parallel_loop
END DO
! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.parallel num_gangs([[NUMGANGS1]] : i32) {
+! CHECK: acc.parallel num_gangs({[[NUMGANGS1]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -171,7 +171,7 @@ subroutine acc_parallel_loop
END DO
! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.parallel num_gangs([[NUMGANGS2]] : i32) {
+! CHECK: acc.parallel num_gangs({[[NUMGANGS2]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90
index bbf51ba36a7d..a369bf01f259 100644
--- a/flang/test/Lower/OpenACC/acc-parallel.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel.f90
@@ -62,7 +62,7 @@ subroutine acc_parallel
! CHECK: acc.parallel {
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {asyncAttr}
+! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
!$acc parallel async(1)
!$acc end parallel
@@ -85,13 +85,13 @@ subroutine acc_parallel
! CHECK: acc.parallel {
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {waitAttr}
+! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]}
!$acc parallel wait(1)
!$acc end parallel
! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.parallel wait([[WAIT1]] : i32) {
+! CHECK: acc.parallel wait({[[WAIT1]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -100,7 +100,7 @@ subroutine acc_parallel
! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
-! CHECK: acc.parallel wait([[WAIT2]], [[WAIT3]] : i32, i32) {
+! CHECK: acc.parallel wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -109,7 +109,7 @@ subroutine acc_parallel
! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.parallel wait([[WAIT4]], [[WAIT5]] : i32, i32) {
+! CHECK: acc.parallel wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -117,7 +117,7 @@ subroutine acc_parallel
!$acc end parallel
! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.parallel num_gangs([[NUMGANGS1]] : i32) {
+! CHECK: acc.parallel num_gangs({[[NUMGANGS1]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -125,14 +125,14 @@ subroutine acc_parallel
!$acc end parallel
! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.parallel num_gangs([[NUMGANGS2]] : i32) {
+! CHECK: acc.parallel num_gangs({[[NUMGANGS2]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
!$acc parallel num_gangs(1, 1, 1)
!$acc end parallel
-! CHECK: acc.parallel num_gangs(%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) {
+! CHECK: acc.parallel num_gangs({%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90
index 4ed7bb8da29a..712bfc80ce38 100644
--- a/flang/test/Lower/OpenACC/acc-serial-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90
@@ -83,7 +83,7 @@ subroutine acc_serial_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {asyncAttr}
+! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
!$acc serial loop async(1)
DO i = 1, n
@@ -124,7 +124,7 @@ subroutine acc_serial_loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {waitAttr}
+! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]}
!$acc serial loop wait(1)
DO i = 1, n
@@ -132,7 +132,7 @@ subroutine acc_serial_loop
END DO
! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.serial wait([[WAIT1]] : i32) {
+! CHECK: acc.serial wait({[[WAIT1]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -147,7 +147,7 @@ subroutine acc_serial_loop
! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
-! CHECK: acc.serial wait([[WAIT2]], [[WAIT3]] : i32, i32) {
+! CHECK: acc.serial wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
@@ -162,7 +162,7 @@ subroutine acc_serial_loop
! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.serial wait([[WAIT4]], [[WAIT5]] : i32, i32) {
+! CHECK: acc.serial wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
! CHECK: acc.loop {
! CHECK: fir.do_loop
! CHECK: acc.yield
diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90
index ab3b0ccd5459..d05e51d3d274 100644
--- a/flang/test/Lower/OpenACC/acc-serial.f90
+++ b/flang/test/Lower/OpenACC/acc-serial.f90
@@ -62,7 +62,7 @@ subroutine acc_serial
! CHECK: acc.serial {
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {asyncAttr}
+! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
!$acc serial async(1)
!$acc end serial
@@ -85,13 +85,13 @@ subroutine acc_serial
! CHECK: acc.serial {
! CHECK: acc.yield
-! CHECK-NEXT: } attributes {waitAttr}
+! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]}
!$acc serial wait(1)
!$acc end serial
! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.serial wait([[WAIT1]] : i32) {
+! CHECK: acc.serial wait({[[WAIT1]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -100,7 +100,7 @@ subroutine acc_serial
! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
-! CHECK: acc.serial wait([[WAIT2]], [[WAIT3]] : i32, i32) {
+! CHECK: acc.serial wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -109,7 +109,7 @@ subroutine acc_serial
! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.serial wait([[WAIT4]], [[WAIT5]] : i32, i32) {
+! CHECK: acc.serial wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90
index 148a7ee31f08..2060e2062c1a 100644
--- a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90
+++ b/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90
@@ -13,11 +13,11 @@
! Check that we are accessing the clone inside the loop
!CHECK-DAG: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
-!CHECK-DAG: %[[NEG_ONE:.*]] = arith.constant -1 : i32
+!CHECK-DAG: %[[UNIT:.*]] = arith.constant 6 : i32
!CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQclX
!CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]]
!CHECK-NEXT: %[[CNST:.*]] = arith.constant
-!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[NEG_ONE]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[UNIT]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
!CHECK-NEXT: %[[CVT_0_1:.*]] = fir.convert %[[ARG1_PVT]]
!CHECK-NEXT: %[[CVT_0_2:.*]] = fir.convert %[[FIVE]]
!CHECK-NEXT: %[[CALL_OP_ASCII:.*]] = fir.call @_FortranAioOutputAscii(%[[CALL_BEGIN_IO]], %[[CVT_0_1]], %[[CVT_0_2]])
diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90
index 2034ac84334e..5d36699bf0e9 100644
--- a/flang/test/Lower/OpenMP/FIR/target.f90
+++ b/flang/test/Lower/OpenMP/FIR/target.f90
@@ -134,6 +134,100 @@ subroutine omp_target_exit_device
end subroutine omp_target_exit_device
!===============================================================================
+! Target_Update `to` clause
+!===============================================================================
+
+subroutine omp_target_update_to
+ integer :: a(1024)
+
+ !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_toEa"}
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+
+ !CHECK: %[[TO_MAP:.*]] = omp.map_info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
+ !CHECK-SAME: map_clauses(to) capture(ByRef)
+ !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+
+ !CHECK: omp.target_update_data
+ !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !$omp target update to(a)
+end subroutine omp_target_update_to
+
+!===============================================================================
+! Target_Update `from` clause
+!===============================================================================
+
+subroutine omp_target_update_from
+ integer :: a(1024)
+
+ !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_fromEa"}
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+
+ !CHECK: %[[FROM_MAP:.*]] = omp.map_info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
+ !CHECK-SAME: map_clauses(from) capture(ByRef)
+ !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+
+ !CHECK: omp.target_update_data
+ !CHECK-SAME: motion_entries(%[[FROM_MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !$omp target update from(a)
+end subroutine omp_target_update_from
+
+!===============================================================================
+! Target_Update `if` clause
+!===============================================================================
+
+subroutine omp_target_update_if
+ integer :: a(1024)
+ logical :: i
+
+ !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+ !CHECK-DAG: %[[COND:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1
+
+ !CHECK: %[[TO_MAP:.*]] = omp.map_info
+
+ !CHECK: omp.target_update_data if(%[[COND]] : i1)
+ !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !$omp target update to(a) if(i)
+end subroutine omp_target_update_if
+
+!===============================================================================
+! Target_Update `device` clause
+!===============================================================================
+
+subroutine omp_target_update_device
+ integer :: a(1024)
+
+ !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+ !CHECK-DAG: %[[DEVICE:.*]] = arith.constant 1 : i32
+
+ !CHECK: %[[TO_MAP:.*]] = omp.map_info
+
+ !CHECK: omp.target_update_data
+ !CHECK-SAME: device(%[[DEVICE]] : i32)
+ !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !$omp target update to(a) device(1)
+end subroutine omp_target_update_device
+
+!===============================================================================
+! Target_Update `nowait` clause
+!===============================================================================
+
+subroutine omp_target_update_nowait
+ integer :: a(1024)
+
+ !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+
+ !CHECK: %[[TO_MAP:.*]] = omp.map_info
+
+ !CHECK: omp.target_update_data
+ !CHECK-SAME: nowait
+ !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !$omp target update to(a) nowait
+end subroutine omp_target_update_nowait
+
+!===============================================================================
! Target_Data with region
!===============================================================================
diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
index e6ea5b5c4051..28f59c95d60b 100644
--- a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
+++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
@@ -15,11 +15,11 @@
! Check that we are accessing the clone inside the loop
!CHECK-DAG: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
-!CHECK-DAG: %[[NEG_ONE:.*]] = arith.constant -1 : i32
+!CHECK-DAG: %[[UNIT:.*]] = arith.constant 6 : i32
!CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQclX
!CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]]
!CHECK-NEXT: %[[CNST:.*]] = arith.constant
-!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[NEG_ONE]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[UNIT]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
!CHECK-NEXT: %[[CVT_0_1:.*]] = fir.convert %[[ARG1_PVT_DECL]]#1
!CHECK-NEXT: %[[CVT_0_2:.*]] = fir.convert %[[FIVE]]
!CHECK-NEXT: %[[CALL_OP_ASCII:.*]] = fir.call @_FortranAioOutputAscii(%[[CALL_BEGIN_IO]], %[[CVT_0_1]], %[[CVT_0_2]])
diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90
index 26b4d595d522..5ca3a08d8a8b 100644
--- a/flang/test/Lower/OpenMP/target.f90
+++ b/flang/test/Lower/OpenMP/target.f90
@@ -135,6 +135,94 @@ subroutine omp_target_exit_device
end subroutine omp_target_exit_device
!===============================================================================
+! Target_Update `to` clause
+!===============================================================================
+
+!CHECK-LABEL: func.func @_QPomp_target_update_to() {
+subroutine omp_target_update_to
+ integer :: a(1024)
+
+ !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}})
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+
+ !CHECK: %[[TO_MAP:.*]] = omp.map_info var_ptr(%[[A_DECL]]#1 : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
+ !CHECK-SAME: map_clauses(to) capture(ByRef)
+ !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+
+ !CHECK: omp.target_update_data motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !$omp target update to(a)
+end subroutine omp_target_update_to
+
+!===============================================================================
+! Target_Update `from` clause
+!===============================================================================
+
+!CHECK-LABEL: func.func @_QPomp_target_update_from() {
+subroutine omp_target_update_from
+ integer :: a(1024)
+
+ !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}})
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+
+ !CHECK: %[[FROM_MAP:.*]] = omp.map_info var_ptr(%[[A_DECL]]#1 : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
+ !CHECK-SAME: map_clauses(from) capture(ByRef)
+ !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+
+ !CHECK: omp.target_update_data motion_entries(%[[FROM_MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !$omp target update from(a)
+end subroutine omp_target_update_from
+
+!===============================================================================
+! Target_Update `if` clause
+!===============================================================================
+
+!CHECK-LABEL: func.func @_QPomp_target_update_if() {
+subroutine omp_target_update_if
+ integer :: a(1024)
+ logical :: i
+
+ !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}})
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+ !CHECK-DAG: %[[COND:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1
+
+ !CHECK: omp.target_update_data if(%[[COND]] : i1) motion_entries
+ !$omp target update from(a) if(i)
+end subroutine omp_target_update_if
+
+!===============================================================================
+! Target_Update `device` clause
+!===============================================================================
+
+!CHECK-LABEL: func.func @_QPomp_target_update_device() {
+subroutine omp_target_update_device
+ integer :: a(1024)
+ logical :: i
+
+ !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}})
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+ !CHECK-DAG: %[[DEVICE:.*]] = arith.constant 1 : i32
+
+ !CHECK: omp.target_update_data device(%[[DEVICE]] : i32) motion_entries
+ !$omp target update from(a) device(1)
+end subroutine omp_target_update_device
+
+!===============================================================================
+! Target_Update `nowait` clause
+!===============================================================================
+
+!CHECK-LABEL: func.func @_QPomp_target_update_nowait() {
+subroutine omp_target_update_nowait
+ integer :: a(1024)
+ logical :: i
+
+ !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}})
+ !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds
+
+ !CHECK: omp.target_update_data nowait motion_entries
+ !$omp target update from(a) nowait
+end subroutine omp_target_update_nowait
+
+!===============================================================================
! Target_Data with region
!===============================================================================
diff --git a/flang/test/Lower/array-character.f90 b/flang/test/Lower/array-character.f90
index ee01589f802e..c93ef4be3082 100644
--- a/flang/test/Lower/array-character.f90
+++ b/flang/test/Lower/array-character.f90
@@ -57,7 +57,7 @@ end subroutine
program p
! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 4 : index
! CHECK-DAG: %[[VAL_1:.*]] = arith.constant 3 : index
- ! CHECK-DAG: %[[VAL_2:.*]] = arith.constant -1 : i32
+ ! CHECK-DAG: %[[VAL_2:.*]] = arith.constant 6 : i32
! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.array<3x!fir.char<1,4>> {bindc_name = "c1", uniq_name = "_QFEc1"}
! CHECK: %[[VAL_6:.*]] = fir.address_of(@_QFEc2) : !fir.ref<!fir.array<3x!fir.char<1,4>>>
! CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,
@@ -91,7 +91,7 @@ end program p
! CHECK-LABEL: func @_QPcharlit() {
subroutine charlit
- ! CHECK-DAG: %[[VAL_0:.*]] = arith.constant -1 : i32
+ ! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 6 : i32
! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_4:.*]] = arith.constant false
! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 4 : index
diff --git a/flang/test/Lower/array-expression-slice-1.f90 b/flang/test/Lower/array-expression-slice-1.f90
index d81b8488205e..152450902432 100644
--- a/flang/test/Lower/array-expression-slice-1.f90
+++ b/flang/test/Lower/array-expression-slice-1.f90
@@ -10,7 +10,7 @@
! CHECK-DAG: %[[VAL_13:.*]] = arith.constant 2 : i64
! CHECK-DAG: %[[VAL_14:.*]] = arith.constant 7 : i64
! CHECK-DAG: %[[VAL_16:.*]] = arith.constant 4 : i64
-! CHECK-DAG: %[[VAL_18:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_18:.*]] = arith.constant 6 : i32
! CHECK-DAG: %[[VAL_19:.*]] = arith.constant 0 : i64
! CHECK-DAG: %[[VAL_20:.*]] = arith.constant 1 : i64
! CHECK-DAG: %[[VAL_21:.*]] = arith.constant 3 : i64
@@ -385,7 +385,7 @@ end program p
! CHECK-DAG: %[[VAL_2:.*]] = arith.constant 2 : index
! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index
! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 4 : index
-! CHECK-DAG: %[[VAL_6:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_6:.*]] = arith.constant 6 : i32
! CHECK-DAG: %[[VAL_7:.*]] = arith.constant 10 : index
! CHECK: %[[VAL_8:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1>>>
diff --git a/flang/test/Lower/array-expression.f90 b/flang/test/Lower/array-expression.f90
index f73cd6e5f4f1..75789cd6952a 100644
--- a/flang/test/Lower/array-expression.f90
+++ b/flang/test/Lower/array-expression.f90
@@ -1113,7 +1113,7 @@ end subroutine test19h
! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<10x!fir.char<1,?>>>
! CHECK: %[[VAL_7:.*]] = arith.constant 2 : index
! CHECK: %[[VAL_8:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_9:.*]] = arith.constant -1 : i32
+! CHECK: %[[VAL_9:.*]] = arith.constant 6 : i32
! CHECK: %[[VAL_10:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref<!fir.char<1,
! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
! CHECK: %[[VAL_12:.*]] = arith.constant {{.*}} : i32
diff --git a/flang/test/Lower/array-temp.f90 b/flang/test/Lower/array-temp.f90
index f8c2ec3e03c5..971b3506fbe3 100644
--- a/flang/test/Lower/array-temp.f90
+++ b/flang/test/Lower/array-temp.f90
@@ -47,7 +47,7 @@ end
! CHECK: %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
! CHECK: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
! CHECK: %[[C_27_i32:[-0-9a-z_]+]] = arith.constant 27 : i32
-! CHECK: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
+! CHECK: %[[C_6_i32:[-0-9a-z_]+]] = arith.constant 6 : i32
! CHECK: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
! CHECK: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
! CHECK: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
@@ -118,7 +118,7 @@ end
! CHECK: cf.br ^bb9(%[[V_42]], %[[V_46:[0-9]+]] : index, index)
! CHECK: ^bb11: // pred: ^bb9
! CHECK: fir.freemem %[[V_18:[0-9]+]] : !fir.heap<!fir.array<?xf32>>
-! CHECK: %[[V_49:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_m1_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+! CHECK: %[[V_49:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_6_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
! CHECK: %[[V_50:[0-9]+]] = fir.slice %[[C_1]], %[[C_2]], %[[C_1]] : (index, index, index) -> !fir.slice<1>
! CHECK: %[[V_51:[0-9]+]] = fir.embox %[[V_4]](%[[V_5]]) [%[[V_50]]] : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<2xf32>>
! CHECK: %[[V_52:[0-9]+]] = fir.convert %[[V_51:[0-9]+]] : (!fir.box<!fir.array<2xf32>>) -> !fir.box<none>
@@ -141,7 +141,7 @@ end
! CHECK: %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
! CHECK: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
! CHECK: %[[C_34_i32:[-0-9a-z_]+]] = arith.constant 34 : i32
-! CHECK: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
+! CHECK: %[[C_6_i32:[-0-9a-z_]+]] = arith.constant 6 : i32
! CHECK: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
! CHECK: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
! CHECK: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
@@ -244,7 +244,7 @@ end
! CHECK: cf.br ^bb15(%[[V_69]], %[[V_70:[0-9]+]] : index, index)
! CHECK: ^bb19: // pred: ^bb15
! CHECK: fir.freemem %[[V_24:[0-9]+]] : !fir.heap<!fir.array<2x?xf32>>
-! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_m1_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_6_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
! CHECK: %[[V_74:[0-9]+]] = fir.slice %[[C_1]], %[[C_2]], %[[C_1]], %[[C_1]], %[[C_2]], %[[C_1]] : (index, index, index, index, index, index) -> !fir.slice<2>
! CHECK: %[[V_75:[0-9]+]] = fir.embox %[[V_4]](%[[V_5]]) [%[[V_74]]] : (!fir.ref<!fir.array<2x?xf32>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box<!fir.array<?x2xf32>>
! CHECK: %[[V_76:[0-9]+]] = fir.convert %[[V_75:[0-9]+]] : (!fir.box<!fir.array<?x2xf32>>) -> !fir.box<none>
@@ -267,7 +267,7 @@ end
! CHECK: %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index
! CHECK: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
! CHECK: %[[C_41_i32:[-0-9a-z_]+]] = arith.constant 41 : i32
-! CHECK: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
+! CHECK: %[[C_6_i32:[-0-9a-z_]+]] = arith.constant 6 : i32
! CHECK: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
! CHECK: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
! CHECK: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
@@ -370,7 +370,7 @@ end
! CHECK: cf.br ^bb15(%[[V_69]], %[[V_70:[0-9]+]] : index, index)
! CHECK: ^bb19: // pred: ^bb15
! CHECK: fir.freemem %[[V_24:[0-9]+]] : !fir.heap<!fir.array<?x2xf32>>
-! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_m1_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+! CHECK: %[[V_73:[0-9]+]] = fir.call @_FortranAioBeginExternalListOutput(%[[C_6_i32]], %{{.*}}, %{{.*}}) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
! CHECK: %[[V_74:[0-9]+]] = fir.slice %[[C_1]], %[[C_2]], %[[C_1]], %[[C_1]], %[[C_2]], %[[C_1]] : (index, index, index, index, index, index) -> !fir.slice<2>
! CHECK: %[[V_75:[0-9]+]] = fir.embox %[[V_4]](%[[V_5]]) [%[[V_74]]] : (!fir.ref<!fir.array<?x2xf32>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box<!fir.array<2x?xf32>>
! CHECK: %[[V_76:[0-9]+]] = fir.convert %[[V_75:[0-9]+]] : (!fir.box<!fir.array<2x?xf32>>) -> !fir.box<none>
diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90
index 073493d7fe28..25e637805e87 100644
--- a/flang/test/Lower/host-associated.f90
+++ b/flang/test/Lower/host-associated.f90
@@ -478,7 +478,7 @@ end subroutine test_proc_dummy_other
! CHECK-DAG: %[[VAL_3:.*]] = arith.constant false
! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 32 : i8
-! CHECK-DAG: %[[VAL_6:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_6:.*]] = arith.constant 6 : i32
! CHECK-DAG: %[[VAL_8:.*]] = arith.constant 10 : i64
! CHECK-DAG: %[[VAL_9:.*]] = arith.constant 40 : index
! CHECK-DAG: %[[VAL_10:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/io-statement-2.f90 b/flang/test/Lower/io-statement-2.f90
index 108c64344f30..dcf7387f8a22 100644
--- a/flang/test/Lower/io-statement-2.f90
+++ b/flang/test/Lower/io-statement-2.f90
@@ -159,11 +159,11 @@ end
! CHECK-LABEL: func @_QPimpliedformat
subroutine impliedformat
- ! CHECK: BeginExternalListInput(%c-1
+ ! CHECK: BeginExternalListInput
! CHECK: InputReal32
! CHECK: EndIoStatement(%3) {{.*}}: (!fir.ref<i8>) -> i32
read*, x
- ! CHECK: BeginExternalListOutput(%c-1
+ ! CHECK: BeginExternalListOutput
! CHECK: OutputReal32
! CHECK: EndIoStatement
print*, x
diff --git a/flang/test/Lower/vector-subscript-io.f90 b/flang/test/Lower/vector-subscript-io.f90
index 9030bebd4ac4..d298609269da 100644
--- a/flang/test/Lower/vector-subscript-io.f90
+++ b/flang/test/Lower/vector-subscript-io.f90
@@ -9,7 +9,7 @@ subroutine simple(x, y)
integer :: x(10)
read(*,*) x(y)
! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 10 : index
-! CHECK-DAG: %[[VAL_1:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_1:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 4 : i32
! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index
@@ -51,7 +51,7 @@ subroutine only_once(x)
real :: x(:, :)
! Test subscripts are only evaluated once.
read(*,*) x(get_substcript(), get_vector())
-! CHECK-DAG: %[[VAL_26:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_26:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_28:.*]] = arith.constant 0 : i64
! CHECK-DAG: %[[VAL_29:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[VAL_30:.*]] = arith.constant 1 : index
@@ -102,7 +102,7 @@ subroutine with_assumed_shapes(x, y)
integer :: y(:)
integer :: x(:)
read(*,*) x(y)
-! CHECK-DAG: %[[VAL_60:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_60:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_62:.*]] = arith.constant 4 : i32
! CHECK-DAG: %[[VAL_63:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[VAL_64:.*]] = arith.constant 1 : index
@@ -138,7 +138,7 @@ subroutine lower_bounds(x, y)
read(*,*) x(3, y)
! CHECK-DAG: %[[VAL_84:.*]] = arith.constant 4 : index
! CHECK-DAG: %[[VAL_85:.*]] = arith.constant 6 : index
-! CHECK-DAG: %[[VAL_86:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_86:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_88:.*]] = arith.constant 3 : i64
! CHECK-DAG: %[[VAL_89:.*]] = arith.constant 2 : index
! CHECK-DAG: %[[VAL_90:.*]] = arith.constant 4 : i32
@@ -177,7 +177,7 @@ subroutine two_vectors(x, y1, y2)
real :: x(4, 4)
read(*,*) x(y1, y2)
! CHECK-DAG: %[[VAL_114:.*]] = arith.constant 4 : index
-! CHECK-DAG: %[[VAL_115:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_115:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_117:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_118:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[VAL_119:.*]] = arith.constant 1 : index
@@ -220,7 +220,7 @@ subroutine triplets_and_vector(x, y)
integer :: y(3)
complex :: x(4, 4)
read(*,*) x(1:4:2, y)
-! CHECK-DAG: %[[VAL_147:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_147:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_149:.*]] = arith.constant 4 : index
! CHECK-DAG: %[[VAL_150:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_151:.*]] = arith.constant 2 : index
@@ -264,7 +264,7 @@ subroutine simple_char(x, y)
character(*) :: x(3:8)
read(*,*) x(y)
! CHECK-DAG: %[[VAL_178:.*]] = arith.constant 6 : index
-! CHECK-DAG: %[[VAL_179:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_179:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_181:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_182:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[VAL_183:.*]] = arith.constant 1 : index
@@ -301,7 +301,7 @@ subroutine substring(x, y, i, j)
integer :: y(3), i, j
character(*) :: x(:)
read(*,*) x(y)(i:j)
-! CHECK-DAG: %[[VAL_206:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_206:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_208:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_209:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[VAL_210:.*]] = arith.constant 1 : index
@@ -347,7 +347,7 @@ subroutine complex_part(z, y)
integer :: y(:)
complex :: z(:)
read(*,*) z(y)%IM
-! CHECK-DAG: %[[VAL_244:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_244:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_246:.*]] = arith.constant 1 : i32
! CHECK-DAG: %[[VAL_247:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[VAL_248:.*]] = arith.constant 1 : index
@@ -392,7 +392,7 @@ subroutine simple_derived(x, y)
type(t) :: x(3:8)
read(*,*) x(y)
! CHECK-DAG: %[[VAL_267:.*]] = arith.constant 6 : index
-! CHECK-DAG: %[[VAL_268:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_268:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_270:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_271:.*]] = arith.constant 4 : index
! CHECK-DAG: %[[VAL_272:.*]] = arith.constant 0 : index
@@ -430,7 +430,7 @@ subroutine with_path(b, i)
integer :: i(:)
read (*, *) b(5, i, 8:9:1)%a(4,5)%i
! CHECK-DAG: %[[VAL_294:.*]] = arith.constant 4 : index
-! CHECK-DAG: %[[VAL_295:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_295:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_297:.*]] = arith.constant 8 : index
! CHECK-DAG: %[[VAL_298:.*]] = arith.constant 9 : index
! CHECK-DAG: %[[VAL_299:.*]] = arith.constant 4 : i64
@@ -481,7 +481,7 @@ subroutine simple_iostat(x, y, j, stat)
integer :: j, y(:), stat
real :: x(:)
read(*, *, iostat=stat) x(y), j
-! CHECK-DAG: %[[VAL_334:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_334:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_336:.*]] = arith.constant false
! CHECK-DAG: %[[VAL_337:.*]] = arith.constant true
! CHECK-DAG: %[[VAL_338:.*]] = arith.constant 1 : index
@@ -527,7 +527,7 @@ subroutine iostat_in_io_loop(k, j, stat)
integer :: stat
read(*, *, iostat=stat) (k(i, j), i=1,3,1)
! CHECK-DAG: %[[VAL_365:.*]] = arith.constant 5 : index
-! CHECK-DAG: %[[VAL_366:.*]] = arith.constant -1 : i32
+! CHECK-DAG: %[[VAL_366:.*]] = arith.constant 5 : i32
! CHECK-DAG: %[[VAL_368:.*]] = arith.constant 3 : index
! CHECK-DAG: %[[VAL_369:.*]] = arith.constant true
! CHECK-DAG: %[[VAL_370:.*]] = arith.constant false
diff --git a/flang/test/Parser/compiler-directives.f90 b/flang/test/Parser/compiler-directives.f90
index 88cfd0944faf..67e8d5b292aa 100644
--- a/flang/test/Parser/compiler-directives.f90
+++ b/flang/test/Parser/compiler-directives.f90
@@ -17,6 +17,7 @@ module m
!dir$ integer
!dir$ integer=64
!dir$ integer = 64
+ !dir$ integer = 64
PROC(4)
!dir$ optimize:1
!dir$ optimize : 1
diff --git a/flang/test/Runtime/no-cpp-dep.c b/flang/test/Runtime/no-cpp-dep.c
index f8fe97b5bf78..654bebed345b 100644
--- a/flang/test/Runtime/no-cpp-dep.c
+++ b/flang/test/Runtime/no-cpp-dep.c
@@ -5,7 +5,10 @@ a C compiler.
REQUIRES: c-compiler
-RUN: %cc -std=c99 %s -I%include %libruntime %libdecimal -lm -o /dev/null
+RUN: %if system-aix %{ export OBJECT_MODE=64 %}
+RUN: %cc -std=c99 %s -I%include %libruntime %libdecimal -lm \
+RUN: %if system-aix %{-lpthread %}
+RUN: rm a.out
*/
#include "flang/Runtime/entry-names.h"
diff --git a/flang/test/Semantics/bad-forward-type.f90 b/flang/test/Semantics/bad-forward-type.f90
index 19e23e654642..432d450a15f3 100644
--- a/flang/test/Semantics/bad-forward-type.f90
+++ b/flang/test/Semantics/bad-forward-type.f90
@@ -84,7 +84,6 @@ subroutine s9
type con
Type(t(3)), pointer :: y
end type
- !ERROR: Cannot construct value for derived type 't' before it is defined
Integer :: nn = Size(Transfer(t(3)(666),[0]))
type :: t(n)
integer, kind :: n = 3
diff --git a/flang/test/Semantics/contiguous01.f90 b/flang/test/Semantics/contiguous01.f90
index 1d3600aef6c5..0f086624a20a 100644
--- a/flang/test/Semantics/contiguous01.f90
+++ b/flang/test/Semantics/contiguous01.f90
@@ -5,7 +5,7 @@ module m0
end
module m
use m0
- !ERROR: Cannot change CONTIGUOUS attribute on use-associated 'p1'
+ !WARNING: Use-associated 'p1' already has 'CONTIGUOUS' attribute
contiguous p1
!ERROR: Cannot change CONTIGUOUS attribute on use-associated 'p2'
contiguous p2
diff --git a/flang/test/Semantics/init01.f90 b/flang/test/Semantics/init01.f90
index 9f75a8d55673..0f5a2144c79f 100644
--- a/flang/test/Semantics/init01.f90
+++ b/flang/test/Semantics/init01.f90
@@ -90,7 +90,7 @@ subroutine components(n)
real, pointer :: p10 => o3%x
associate (a1 => o3, a2 => o3%x)
block
- real, pointer :: p11 => a1
+ type(t3), pointer :: p11 => a1
real, pointer :: p12 => a2
end block
end associate
diff --git a/flang/test/Semantics/intrinsics02.f90 b/flang/test/Semantics/intrinsics02.f90
new file mode 100644
index 000000000000..0b1f7c13a156
--- /dev/null
+++ b/flang/test/Semantics/intrinsics02.f90
@@ -0,0 +1,38 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+module explicit
+ intrinsic cos
+end
+subroutine testExplicit
+ use explicit
+ !ERROR: 'cos' is use-associated from module 'explicit' and cannot be re-declared
+ real :: cos = 2.
+end
+subroutine extendsUsedIntrinsic
+ use explicit
+ interface cos
+ pure real function mycos(x)
+ real, intent(in) :: x
+ end
+ end interface
+end
+subroutine sameIntrinsic1
+ use explicit
+ !WARNING: Use-associated 'cos' already has 'INTRINSIC' attribute
+ intrinsic cos
+ real :: one = cos(0.)
+end
+module renamer
+ use explicit, renamedCos => cos
+end
+subroutine sameIntrinsic2
+ use explicit
+ use renamer, cos => renamedCos
+ real :: one = cos(0.)
+end
+module implicit
+ real :: one = cos(0.)
+end
+subroutine testImplicit
+ use implicit
+ real :: cos = 2.
+end
diff --git a/flang/test/Semantics/notifywait01.f90 b/flang/test/Semantics/notifywait01.f90
new file mode 100644
index 000000000000..83a58ba79288
--- /dev/null
+++ b/flang/test/Semantics/notifywait01.f90
@@ -0,0 +1,26 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! This test checks the acceptance of standard-conforming notify-wait-stmts based
+! on the statement specification in section 11.6 of the Fortran 2023 standard.
+
+program test_notify_wait
+ use iso_fortran_env, only: notify_type
+ implicit none
+
+ type(notify_type) :: notify_var[*]
+ integer :: count, count_array(1), sync_status, coindexed_integer[*]
+ character(len=128) :: error_message
+
+ !_______________________ standard-conforming statements ___________________________
+
+ notify wait(notify_var)
+ notify wait(notify_var, until_count=count)
+ notify wait(notify_var, until_count=count_array(1))
+ notify wait(notify_var, until_count=coindexed_integer[1])
+ notify wait(notify_var, stat=sync_status)
+ notify wait(notify_var, until_count=count, stat=sync_status)
+ notify wait(notify_var, errmsg=error_message)
+ notify wait(notify_var, until_count=count, errmsg=error_message)
+ notify wait(notify_var, stat=sync_status, errmsg=error_message)
+ notify wait(notify_var, until_count=count, stat=sync_status, errmsg=error_message)
+
+end program test_notify_wait
diff --git a/flang/test/Semantics/notifywait02.f90 b/flang/test/Semantics/notifywait02.f90
new file mode 100644
index 000000000000..eebf3d05edaf
--- /dev/null
+++ b/flang/test/Semantics/notifywait02.f90
@@ -0,0 +1,74 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! This test checks for semantic errors in notify wait statements based on the
+! statement specification in section 11.6 of the Fortran 2023 standard
+
+program test_notify_wait
+ use iso_fortran_env, only: notify_type
+ implicit none
+
+ ! notify_type variables must be coarrays
+ type(notify_type) :: non_coarray
+
+ type(notify_type) :: notify_var[*], redundant_notify[*]
+ integer :: count, sync_status
+ character(len=128) :: error_message
+
+ !____________________ non-standard-conforming statements __________________________
+
+ !_________________________ invalid notify-variable ________________________________
+
+ ! notify-variable has an unknown expression
+ !ERROR: expected '('
+ notify wait(notify=notify_var)
+
+ !_____________ invalid event-wait-spec-lists: invalid until-spec _________________
+
+ ! Invalid until-spec keyword
+ !ERROR: expected '('
+ notify wait(notify_var, until_amount=count)
+
+ ! Invalid until-spec: missing until-spec variable
+ !ERROR: expected '('
+ notify wait(notify_var, until_count)
+
+ ! Invalid until-spec: missing 'until_count='
+ !ERROR: expected '('
+ notify wait(notify_var, count)
+
+ !_________________ invalid sync-stat-lists: invalid stat= ________________________
+
+ ! Invalid stat-variable keyword
+ !ERROR: expected '('
+ notify wait(notify_var, status=sync_status)
+
+ ! Invalid sync-stat-list: missing stat-variable
+ !ERROR: expected '('
+ notify wait(notify_var, stat)
+
+ ! Invalid sync-stat-list: missing 'stat='
+ !ERROR: expected '('
+ notify wait(notify_var, sync_status)
+
+ !________________ invalid sync-stat-lists: invalid errmsg= _______________________
+
+ ! Invalid errmsg-variable keyword
+ !ERROR: expected '('
+ notify wait(notify_var, errormsg=error_message)
+
+ ! Invalid sync-stat-list: missing 'errmsg='
+ !ERROR: expected '('
+ notify wait(notify_var, error_message)
+
+ ! Invalid sync-stat-list: missing errmsg-variable
+ !ERROR: expected '('
+ notify wait(notify_var, errmsg)
+
+ !______________ invalid notify-variable: redundant notify-variable _________________
+
+ !ERROR: expected '('
+ notify wait(notify_var, redundant_notify)
+
+ !ERROR: expected '('
+ notify wait(notify_var, redundant_notify, stat=sync_status, errmsg=error_message)
+
+end program test_notify_wait
diff --git a/flang/test/Semantics/notifywait03.f90 b/flang/test/Semantics/notifywait03.f90
new file mode 100644
index 000000000000..0fc56f66ad32
--- /dev/null
+++ b/flang/test/Semantics/notifywait03.f90
@@ -0,0 +1,123 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! This test checks for semantic errors in notify wait statements based on the
+! statement specification in section 11.6 of the Fortran 2023 standard.
+! Some of the errors in this test would be hidden by the errors in
+! the test notifywait02.f90 if they were included in that file,
+! and are thus tested here.
+
+program test_notify_wait
+ use iso_fortran_env, only : notify_type
+ implicit none
+
+ ! notify_type variables must be coarrays
+ type(notify_type) :: non_coarray
+
+ type(notify_type) :: notify_var[*], notify_array(2)[*]
+ integer :: count, count_array(1), non_notify[*], sync_status, coindexed_integer[*], superfluous_stat, non_scalar(1)
+ character(len=128) :: error_message, non_scalar_char(1), coindexed_character[*], superfluous_errmsg
+ logical :: invalid_type
+
+ !____________________ non-standard-conforming statements __________________________
+
+ !_________________________ invalid notify-variable ________________________________
+
+ !ERROR: The notify-variable must be of type NOTIFY_TYPE from module ISO_FORTRAN_ENV
+ notify wait(non_notify)
+
+ !ERROR: The notify-variable must be a coarray
+ notify wait(non_coarray)
+
+ !ERROR: A notify-variable in a NOTIFY WAIT statement may not be a coindexed object
+ notify wait(notify_var[1])
+
+ !ERROR: A notify-variable in a NOTIFY WAIT statement may not be a coindexed object
+ notify wait(notify_array(1)[1])
+
+ !ERROR: Must be a scalar value, but is a rank-1 array
+ notify wait(notify_array)
+
+ !_____________ invalid event-wait-spec-lists: invalid until-spec _________________
+
+ !ERROR: Must have INTEGER type, but is LOGICAL(4)
+ notify wait(notify_var, until_count=invalid_type)
+
+ !ERROR: Must be a scalar value, but is a rank-1 array
+ notify wait(notify_var, until_count=non_scalar)
+
+ !_________________ invalid sync-stat-lists: invalid stat= ________________________
+
+ !ERROR: Must have INTEGER type, but is LOGICAL(4)
+ notify wait(notify_var, stat=invalid_type)
+
+ !ERROR: Must be a scalar value, but is a rank-1 array
+ notify wait(notify_var, stat=non_scalar)
+
+ !________________ invalid sync-stat-lists: invalid errmsg= _______________________
+
+ !ERROR: Must have CHARACTER type, but is LOGICAL(4)
+ notify wait(notify_var, errmsg=invalid_type)
+
+ !ERROR: Must be a scalar value, but is a rank-1 array
+ notify wait(notify_var, errmsg=non_scalar_char)
+
+ !______ invalid event-wait-spec-lists: redundant event-wait-spec-list ____________
+
+ !ERROR: Until-spec in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, until_count=count, until_count=count_array(1))
+
+ !ERROR: Until-spec in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, until_count=count, stat=sync_status, until_count=count_array(1))
+
+ !ERROR: Until-spec in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, until_count=count, errmsg=error_message, until_count=count_array(1))
+
+ !ERROR: Until-spec in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, until_count=count, stat=sync_status, errmsg=error_message, until_count=count_array(1))
+
+ !ERROR: A stat-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, stat=sync_status, stat=superfluous_stat)
+
+ !ERROR: A stat-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, stat=sync_status, until_count=count, stat=superfluous_stat)
+
+ !ERROR: A stat-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, stat=sync_status, errmsg=error_message, stat=superfluous_stat)
+
+ !ERROR: A stat-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, stat=sync_status, until_count=count, errmsg=error_message, stat=superfluous_stat)
+
+ !ERROR: A errmsg-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, errmsg=error_message, errmsg=superfluous_errmsg)
+
+ !ERROR: A errmsg-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, errmsg=error_message, until_count=count, errmsg=superfluous_errmsg)
+
+ !ERROR: A errmsg-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, errmsg=error_message, stat=superfluous_stat, errmsg=superfluous_errmsg)
+
+ !ERROR: A errmsg-variable in a event-wait-spec-list may not be repeated
+ notify wait(notify_var, errmsg=error_message, until_count=count, stat=superfluous_stat, errmsg=superfluous_errmsg)
+
+ !_____________ invalid sync-stat-lists: coindexed stat-variable - C1173 __________________
+
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ notify wait(notify_var, stat=coindexed_integer[1])
+
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ notify wait(notify_var, errmsg=coindexed_character[1])
+
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ notify wait(notify_var, stat=coindexed_integer[1], errmsg=error_message)
+
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ notify wait(notify_var, stat=sync_status, errmsg=coindexed_character[1])
+
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ notify wait(notify_var, stat=coindexed_integer[1], errmsg=coindexed_character[1])
+
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ !ERROR: The stat-variable or errmsg-variable in a event-wait-spec-list may not be a coindexed object
+ notify wait(notify_var, errmsg=coindexed_character[1], stat=coindexed_integer[1])
+
+end program test_notify_wait
diff --git a/flang/test/Semantics/pointer01.f90 b/flang/test/Semantics/pointer01.f90
index cb860f3a3f43..9e87d1b689eb 100644
--- a/flang/test/Semantics/pointer01.f90
+++ b/flang/test/Semantics/pointer01.f90
@@ -16,6 +16,7 @@ program main
!ERROR: 'inner' cannot have the POINTER attribute
pointer inner
real obj
+ !ERROR: 'ip' is a pointer but is not initialized like one
!ERROR: 'ip' may not have both the POINTER and PARAMETER attributes
integer, parameter :: ip = 123
pointer ip
diff --git a/flang/test/Semantics/resolve61.f90 b/flang/test/Semantics/resolve61.f90
index 6728050243ec..d6499f07b860 100644
--- a/flang/test/Semantics/resolve61.f90
+++ b/flang/test/Semantics/resolve61.f90
@@ -107,11 +107,16 @@ subroutine p12
type t2
integer c2
end type
+ type, bind(c) :: t3
+ integer c3
+ end type
type(t1) :: x1
type(t2) :: x2
+ type(t3) :: x3
pointer(a, x1)
- !ERROR: Type of Cray pointee 'x2' is a non-sequence derived type
+ !ERROR: Type of Cray pointee 'x2' is a derived type that is neither SEQUENCE nor BIND(C)
pointer(b, x2)
+ pointer(c, x3)
end
subroutine p13
diff --git a/flang/test/Semantics/symbol15.f90 b/flang/test/Semantics/symbol15.f90
index 318819e224cd..97dc50a23845 100644
--- a/flang/test/Semantics/symbol15.f90
+++ b/flang/test/Semantics/symbol15.f90
@@ -14,10 +14,10 @@ module m
!DEF: /m/op2 POINTER, PUBLIC ObjectEntity REAL(4)
!DEF: /m/null INTRINSIC, PUBLIC, PURE (Function) ProcEntity
real, pointer :: op2 => null()
- !DEF: /m/op3 POINTER, PUBLIC (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/op3 POINTER, PUBLIC ObjectEntity REAL(4)
!DEF: /m/x PUBLIC, TARGET ObjectEntity REAL(4)
real, pointer :: op3 => x
- !DEF: /m/op4 POINTER, PUBLIC (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/op4 POINTER, PUBLIC ObjectEntity REAL(4)
!DEF: /m/y PUBLIC, TARGET ObjectEntity REAL(4)
real, pointer :: op4 => y(1)
!REF: /m/iface
@@ -50,10 +50,10 @@ module m
!DEF: /m/t1/opc2 POINTER ObjectEntity REAL(4)
!REF: /m/null
real, pointer :: opc2 => null()
- !DEF: /m/t1/opc3 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/t1/opc3 POINTER ObjectEntity REAL(4)
!REF: /m/x
real, pointer :: opc3 => x
- !DEF: /m/t1/opc4 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/t1/opc4 POINTER ObjectEntity REAL(4)
!REF: /m/y
real, pointer :: opc4 => y(1)
!REF: /m/iface
@@ -100,10 +100,10 @@ module m
!DEF: /m/pdt1/opc2 POINTER ObjectEntity REAL(4)
!REF: /m/null
real, pointer :: opc2 => null()
- !DEF: /m/pdt1/opc3 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/pdt1/opc3 POINTER ObjectEntity REAL(4)
!REF: /m/x
real, pointer :: opc3 => x
- !DEF: /m/pdt1/opc4 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/pdt1/opc4 POINTER ObjectEntity REAL(4)
!REF: /m/y
!REF: /m/pdt1/k
real, pointer :: opc4 => y(k)
@@ -160,10 +160,10 @@ module m
subroutine ext2
end subroutine
end interface
- !DEF: /m/op10 POINTER, PUBLIC(InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/op10 POINTER, PUBLIC ObjectEntity REAL(4)
!REF: /m/x
real, pointer :: op10 => x
- !DEF: /m/op11 POINTER, PUBLIC(InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/op11 POINTER, PUBLIC ObjectEntity REAL(4)
!REF: /m/y
real, pointer :: op11 => y(1)
!REF: /m/iface
@@ -176,10 +176,10 @@ module m
procedure(iface), pointer :: pp11 => ext2
!DEF: /m/t2 PUBLIC DerivedType
type :: t2
- !DEF: /m/t2/opc10 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/t2/opc10 POINTER ObjectEntity REAL(4)
!REF: /m/x
real, pointer :: opc10 => x
- !DEF: /m/t2/opc11 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/t2/opc11 POINTER ObjectEntity REAL(4)
!REF: /m/y
real, pointer :: opc11 => y(1)
!REF: /m/iface
@@ -203,10 +203,10 @@ module m
type :: pdt2(k)
!REF: /m/pdt2/k
integer, kind :: k
- !DEF: /m/pdt2/opc10 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/pdt2/opc10 POINTER ObjectEntity REAL(4)
!REF: /m/x
real, pointer :: opc10 => x
- !DEF: /m/pdt2/opc11 POINTER (InDataStmt) ObjectEntity REAL(4)
+ !DEF: /m/pdt2/opc11 POINTER ObjectEntity REAL(4)
!REF: /m/y
!REF: /m/pdt2/k
real, pointer :: opc11 => y(k)
diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir
index d42924a17a80..61cddd4f48df 100644
--- a/flang/test/Transforms/simplifyintrinsics.fir
+++ b/flang/test/Transforms/simplifyintrinsics.fir
@@ -2158,9 +2158,9 @@ func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_n
}
// CHECK-LABEL: func.func @_QPtestminloc_1d_dim(
// CHECK-SAME: %[[ARR:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> {
-// CHECK: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
+// CHECK: fir.call @_FortranAMinlocDimx1_i32_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> ()
-// CHECK-LABEL: func.func private @_FortranAMinlocDimx1_i32_contract_simplified(%arg0: !fir.ref<!fir.box<none>>, %arg1: !fir.box<none>, %arg2: !fir.box<none>) attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK-LABEL: func.func private @_FortranAMinlocDimx1_i32_i32_contract_simplified(%arg0: !fir.ref<!fir.box<none>>, %arg1: !fir.box<none>, %arg2: !fir.box<none>) attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
// CHECK-NEXT: %[[V0:.*]] = fir.alloca i32
// CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
// CHECK-NEXT: %c1 = arith.constant 1 : index
diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp
index 2b648b31666a..dfc3ad68b3ab 100644
--- a/flang/unittests/Runtime/CommandTest.cpp
+++ b/flang/unittests/Runtime/CommandTest.cpp
@@ -10,9 +10,15 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/extensions.h"
#include "flang/Runtime/main.h"
+#include <cstddef>
#include <cstdlib>
+#if _REENTRANT || _POSIX_C_SOURCE >= 199506L
+#include <limits.h> // LOGIN_NAME_MAX used in getlog test
+#endif
+
using namespace Fortran::runtime;
template <std::size_t n = 64>
@@ -59,6 +65,13 @@ protected:
return res;
}
+ void CheckCharEqStr(const char *value, const std::string &expected) const {
+ ASSERT_NE(value, nullptr);
+ EXPECT_EQ(std::strncmp(value, expected.c_str(), expected.size()), 0)
+ << "expected: " << expected << "\n"
+ << "value: " << value;
+ }
+
void CheckDescriptorEqStr(
const Descriptor *value, const std::string &expected) const {
ASSERT_NE(value, nullptr);
@@ -397,6 +410,11 @@ class EnvironmentVariables : public CommandFixture {
protected:
EnvironmentVariables() : CommandFixture(0, nullptr) {
SetEnv("NAME", "VALUE");
+#ifdef _WIN32
+ SetEnv("USERNAME", "loginName");
+#else
+ SetEnv("LOGNAME", "loginName");
+#endif
SetEnv("EMPTY", "");
}
@@ -494,3 +512,68 @@ TEST_F(EnvironmentVariables, ErrMsgTooShort) {
1);
CheckDescriptorEqStr(errMsg.get(), "Mis");
}
+
+// The first character of the username must not be null
+TEST_F(EnvironmentVariables, GetlogGetName) {
+ const int charLen{3};
+ char input[charLen]{"\0\0"};
+
+ FORTRAN_PROCEDURE_NAME(getlog)
+ (reinterpret_cast<std::byte *>(input), charLen);
+
+ EXPECT_NE(input[0], '\0');
+}
+
+#if _REENTRANT || _POSIX_C_SOURCE >= 199506L
+TEST_F(EnvironmentVariables, GetlogPadSpace) {
+ // guarantee 1 char longer than max, last char should be pad space
+ const int charLen{LOGIN_NAME_MAX + 2};
+ char input[charLen];
+
+ FORTRAN_PROCEDURE_NAME(getlog)
+ (reinterpret_cast<std::byte *>(input), charLen);
+
+ EXPECT_EQ(input[charLen - 1], ' ');
+}
+#endif
+
+#ifdef _WIN32 // Test ability to get name from environment variable
+TEST_F(EnvironmentVariables, GetlogEnvGetName) {
+ if (EnableFineGrainedTests()) {
+ ASSERT_NE(std::getenv("USERNAME"), nullptr)
+ << "Environment variable USERNAME does not exist";
+
+ char input[]{"XXXXXXXXX"};
+ FORTRAN_PROCEDURE_NAME(getlog)
+ (reinterpret_cast<std::byte *>(input), sizeof(input));
+
+ CheckCharEqStr(input, "loginName");
+ }
+}
+
+TEST_F(EnvironmentVariables, GetlogEnvBufferShort) {
+ if (EnableFineGrainedTests()) {
+ ASSERT_NE(std::getenv("USERNAME"), nullptr)
+ << "Environment variable USERNAME does not exist";
+
+ char input[]{"XXXXXX"};
+ FORTRAN_PROCEDURE_NAME(getlog)
+ (reinterpret_cast<std::byte *>(input), sizeof(input));
+
+ CheckCharEqStr(input, "loginN");
+ }
+}
+
+TEST_F(EnvironmentVariables, GetlogEnvPadSpace) {
+ if (EnableFineGrainedTests()) {
+ ASSERT_NE(std::getenv("USERNAME"), nullptr)
+ << "Environment variable USERNAME does not exist";
+
+ char input[]{"XXXXXXXXXX"};
+ FORTRAN_PROCEDURE_NAME(getlog)
+ (reinterpret_cast<std::byte *>(input), sizeof(input));
+
+ CheckCharEqStr(input, "loginName ");
+ }
+}
+#endif
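
The new getlog tests above check a GETLOG-style extension that fills a fixed-length character buffer and pads the remainder with spaces. For illustration only, a minimal standalone sketch of that padding convention in plain C++ (pad_copy is a hypothetical helper, not the runtime implementation):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstring>

    // Hypothetical helper: copy `src` into a fixed-length buffer and pad the
    // remainder with spaces, mirroring Fortran CHARACTER assignment semantics.
    static void pad_copy(char *dst, std::size_t dst_len, const char *src) {
      std::size_t n = std::min(dst_len, std::strlen(src));
      std::memcpy(dst, src, n);
      std::memset(dst + n, ' ', dst_len - n);
    }

    int main() {
      char buf[10];
      pad_copy(buf, sizeof(buf), "loginName");  // 9 characters plus 1 pad space
      assert(std::memcmp(buf, "loginName ", 10) == 0);
      return 0;
    }
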
diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp
index 5afed750c0b1..43263d1ac423 100644
--- a/flang/unittests/Runtime/Numeric.cpp
+++ b/flang/unittests/Runtime/Numeric.cpp
@@ -86,7 +86,7 @@ TEST(Numeric, Nearest) {
EXPECT_EQ(RTNAME(Nearest8)(Real<8>{1.0}, true),
Real<8>{1.0} + std::ldexp(Real<8>{1.0}, -52));
EXPECT_EQ(RTNAME(Nearest8)(Real<8>{1.0}, false),
- Real<8>{1.0} - std::ldexp(Real<8>{1.0}, -52));
+ Real<8>{1.0} - 0.5 * std::ldexp(Real<8>{1.0}, -52));
}
TEST(Numeric, Nint) {
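
The updated expectation reflects that the nearest representable value below 1.0 is only half a ULP away, because the spacing of doubles halves just below a power of two. A minimal sketch, independent of the Fortran runtime, that checks the same fact with the standard library:

    #include <cassert>
    #include <cmath>

    int main() {
      double up = std::nextafter(1.0, 2.0);    // 1.0 + 2^-52
      double down = std::nextafter(1.0, 0.0);  // 1.0 - 2^-53: spacing halves below 1.0
      assert(up - 1.0 == std::ldexp(1.0, -52));
      assert(1.0 - down == 0.5 * std::ldexp(1.0, -52));
      return 0;
    }
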
diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp
index b5b8eb059437..9dd2771fe4a7 100644
--- a/flang/unittests/Runtime/NumericalFormatTest.cpp
+++ b/flang/unittests/Runtime/NumericalFormatTest.cpp
@@ -654,28 +654,44 @@ TEST(IOApiTests, FormatDoubleValues) {
{"(EX24.13,';')", " 0XF.FFFFFFFFFFFF8P+1020;"},
}},
{// EX rounding
- 0x3ff1000000000000uLL, // 1.0625
+ 0x3ff0100000000000uLL,
{
- {"(F7.4,';')", " 1.0625;"},
- {"(EX9.1,';')", " 0X8.8P-3;"},
- {"(EX9.0,';')", " 0X8.P-3;"},
- {"(RN,EX9.0,';')", " 0X8.P-3;"},
- {"(RU,EX9.0,';')", " 0X9.P-3;"},
- {"(RD,EX9.0,';')", " 0X8.P-3;"},
- {"(RZ,EX9.0,';')", " 0X8.P-3;"},
- {"(RC,EX9.0,';')", " 0X9.P-3;"},
+ {"(F11.8,';')", " 1.00390625;"},
+ {"(EX10.2,';')", " 0X8.08P-3;"},
+ {"(EX10.1,';')", " 0X8.0P-3;"},
+ {"(EX10.0,';')", " 0X8.08P-3;"},
+ {"(EX0.0,';')", "0X8.08P-3;"},
+ {"(EX0,';')", "0X8.08P-3;"},
+ {"(RN,EX10.1,';')", " 0X8.0P-3;"},
+ {"(RU,EX10.1,';')", " 0X8.1P-3;"},
+ {"(RD,EX10.1,';')", " 0X8.0P-3;"},
+ {"(RZ,EX10.1,';')", " 0X8.0P-3;"},
+ {"(RC,EX10.1,';')", " 0X8.1P-3;"},
+ {"(RN,EX10.0,';')", " 0X8.08P-3;"},
+ {"(RU,EX10.0,';')", " 0X8.08P-3;"},
+ {"(RD,EX10.0,';')", " 0X8.08P-3;"},
+ {"(RZ,EX10.0,';')", " 0X8.08P-3;"},
+ {"(RC,EX10.0,';')", " 0X8.08P-3;"},
}},
{// EX rounding
- 0xbff1000000000000uLL, // -1.0625
+ 0xbff0100000000000uLL,
{
- {"(F7.4,';')", "-1.0625;"},
- {"(EX9.1,';')", "-0X8.8P-3;"},
- {"(EX9.0,';')", " -0X8.P-3;"},
- {"(RN,EX9.0,';')", " -0X8.P-3;"},
- {"(RU,EX9.0,';')", " -0X8.P-3;"},
- {"(RD,EX9.0,';')", " -0X9.P-3;"},
- {"(RZ,EX9.0,';')", " -0X8.P-3;"},
- {"(RC,EX9.0,';')", " -0X9.P-3;"},
+ {"(F11.8,';')", "-1.00390625;"},
+ {"(EX10.2,';')", "-0X8.08P-3;"},
+ {"(EX10.1,';')", " -0X8.0P-3;"},
+ {"(EX10.0,';')", "-0X8.08P-3;"},
+ {"(EX0.0,';')", "-0X8.08P-3;"},
+ {"(EX0,';')", "-0X8.08P-3;"},
+ {"(RN,EX10.1,';')", " -0X8.0P-3;"},
+ {"(RU,EX10.1,';')", " -0X8.0P-3;"},
+ {"(RD,EX10.1,';')", " -0X8.1P-3;"},
+ {"(RZ,EX10.1,';')", " -0X8.0P-3;"},
+ {"(RC,EX10.1,';')", " -0X8.1P-3;"},
+ {"(RN,EX10.0,';')", "-0X8.08P-3;"},
+ {"(RU,EX10.0,';')", "-0X8.08P-3;"},
+ {"(RD,EX10.0,';')", "-0X8.08P-3;"},
+ {"(RZ,EX10.0,';')", "-0X8.08P-3;"},
+ {"(RC,EX10.0,';')", "-0X8.08P-3;"},
}},
};
@@ -840,49 +856,70 @@ TEST(IOApiTests, FormatIntegerValues) {
// Ensure double input values correctly map to raw uint64 values
TEST(IOApiTests, EditDoubleInputValues) {
- using TestCaseTy = std::tuple<const char *, const char *, std::uint64_t>;
+ using TestCaseTy = std::tuple<const char *, const char *, std::uint64_t, int>;
+ int ovf{IostatRealInputOverflow};
static const std::vector<TestCaseTy> testCases{
- {"(F18.0)", " 0", 0x0},
- {"(F18.0)", " ", 0x0},
- {"(F18.0)", " -0", 0x8000000000000000},
- {"(F18.0)", " 01", 0x3ff0000000000000},
- {"(F18.0)", " 1", 0x3ff0000000000000},
- {"(F18.0)", " 125.", 0x405f400000000000},
- {"(F18.0)", " 12.5", 0x4029000000000000},
- {"(F18.0)", " 1.25", 0x3ff4000000000000},
- {"(F18.0)", " 01.25", 0x3ff4000000000000},
- {"(F18.0)", " .125", 0x3fc0000000000000},
- {"(F18.0)", " 0.125", 0x3fc0000000000000},
- {"(F18.0)", " .0625", 0x3fb0000000000000},
- {"(F18.0)", " 0.0625", 0x3fb0000000000000},
- {"(F18.0)", " 125", 0x405f400000000000},
- {"(F18.1)", " 125", 0x4029000000000000},
- {"(F18.2)", " 125", 0x3ff4000000000000},
- {"(F18.3)", " 125", 0x3fc0000000000000},
- {"(-1P,F18.0)", " 125", 0x4093880000000000}, // 1250
- {"(1P,F18.0)", " 125", 0x4029000000000000}, // 12.5
- {"(BZ,F18.0)", " 125 ", 0x4093880000000000}, // 1250
- {"(BZ,F18.0)", " 125 . e +1 ", 0x42a6bcc41e900000}, // 1.25e13
- {"(BZ,F18.0)", " . ", 0x0},
- {"(BZ,F18.0)", " . e +1 ", 0x0},
- {"(DC,F18.0)", " 12,5", 0x4029000000000000},
- {"(EX22.0)", "0X0P0 ", 0x0}, // +0.
- {"(EX22.0)", "-0X0P0 ", 0x8000000000000000}, // -0.
- {"(EX22.0)", "0X.8P1 ", 0x3ff0000000000000}, // 1.0
- {"(EX22.0)", "0X8.P-3 ", 0x3ff0000000000000}, // 1.0
- {"(EX22.0)", "0X.1P4 ", 0x3ff0000000000000}, // 1.0
- {"(EX22.0)", "0X10.P-4 ", 0x3ff0000000000000}, // 1.0
- {"(EX22.0)", "0X8.00P-3 ", 0x3ff0000000000000}, // 1.0
- {"(EX22.0)", "0X80.0P-6 ", 0x4000000000000000}, // 2.0
- {"(EX22.0)", "0XC.CCCCCCCCCCCDP-7 ", 0x3fb999999999999a}, // 0.1
- {"(EX22.0)", "0X.8P-1021 ", 0x0010000000000000}, // min normal
- {"(EX22.0)", "0X.8P-1022 ", 0x0008000000000000}, // subnormal
- {"(EX22.0)", "0X.8P-1073 ", 0x0000000000000001}, // min subn.
- {"(EX22.0)", "0X.FFFFFFFFFFFFF8P1024", 0x7fefffffffffffff}, // max finite
- {"(EX22.0)", "0X.8P1025 ", 0x7ff0000000000000}, // +Inf
- {"(EX22.0)", "-0X.8P1025 ", 0xfff0000000000000}, // -Inf
+ {"(F18.0)", " 0", 0x0, 0},
+ {"(F18.0)", " ", 0x0, 0},
+ {"(F18.0)", " -0", 0x8000000000000000, 0},
+ {"(F18.0)", " 01", 0x3ff0000000000000, 0},
+ {"(F18.0)", " 1", 0x3ff0000000000000, 0},
+ {"(F18.0)", " 125.", 0x405f400000000000, 0},
+ {"(F18.0)", " 12.5", 0x4029000000000000, 0},
+ {"(F18.0)", " 1.25", 0x3ff4000000000000, 0},
+ {"(F18.0)", " 01.25", 0x3ff4000000000000, 0},
+ {"(F18.0)", " .125", 0x3fc0000000000000, 0},
+ {"(F18.0)", " 0.125", 0x3fc0000000000000, 0},
+ {"(F18.0)", " .0625", 0x3fb0000000000000, 0},
+ {"(F18.0)", " 0.0625", 0x3fb0000000000000, 0},
+ {"(F18.0)", " 125", 0x405f400000000000, 0},
+ {"(F18.1)", " 125", 0x4029000000000000, 0},
+ {"(F18.2)", " 125", 0x3ff4000000000000, 0},
+ {"(F18.3)", " 125", 0x3fc0000000000000, 0},
+ {"(-1P,F18.0)", " 125", 0x4093880000000000, 0}, // 1250
+ {"(1P,F18.0)", " 125", 0x4029000000000000, 0}, // 12.5
+ {"(BZ,F18.0)", " 125 ", 0x4093880000000000, 0}, // 1250
+ {"(BZ,F18.0)", " 125 . e +1 ", 0x42a6bcc41e900000, 0}, // 1.25e13
+ {"(BZ,F18.0)", " . ", 0x0, 0},
+ {"(BZ,F18.0)", " . e +1 ", 0x0, 0},
+ {"(DC,F18.0)", " 12,5", 0x4029000000000000, 0},
+ {"(EX22.0)", "0X0P0 ", 0x0, 0}, // +0.
+ {"(EX22.0)", "-0X0P0 ", 0x8000000000000000, 0}, // -0.
+ {"(EX22.0)", "0X.8P1 ", 0x3ff0000000000000, 0}, // 1.0
+ {"(EX22.0)", "0X8.P-3 ", 0x3ff0000000000000, 0}, // 1.0
+ {"(EX22.0)", "0X.1P4 ", 0x3ff0000000000000, 0}, // 1.0
+ {"(EX22.0)", "0X10.P-4 ", 0x3ff0000000000000, 0}, // 1.0
+ {"(EX22.0)", "0X8.00P-3 ", 0x3ff0000000000000, 0}, // 1.0
+ {"(EX22.0)", "0X80.0P-6 ", 0x4000000000000000, 0}, // 2.0
+ {"(EX22.0)", "0XC.CCCCCCCCCCCDP-7 ", 0x3fb999999999999a, 0}, // 0.1
+ {"(EX22.0)", "0X.8P-1021 ", 0x0010000000000000,
+ 0}, // min normal
+ {"(EX22.0)", "0X.8P-1022 ", 0x0008000000000000,
+ 0}, // subnormal
+ {"(EX22.0)", "0X.8P-1073 ", 0x0000000000000001,
+ 0}, // min subn.
+ {"(EX22.0)", "0X.FFFFFFFFFFFFF8P1024", 0x7fefffffffffffff,
+ 0}, // max finite
+ {"(EX22.0)", "0X.8P1025 ", 0x7ff0000000000000, ovf}, // +Inf
+ {"(EX22.0)", "-0X.8P1025 ", 0xfff0000000000000, ovf}, // -Inf
+ {"(RZ,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE()
+ {"(RD,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE()
+ {"(RU,F7.0)", " 2.e308", 0x7ff0000000000000, ovf}, // +Inf
+ {"(RZ,F7.0)", "-2.e308", 0xffefffffffffffff, 0}, // -HUGE()
+ {"(RD,F7.0)", "-2.e308", 0xfff0000000000000, ovf}, // -Inf
+ {"(RU,F7.0)", "-2.e308", 0xffefffffffffffff, 0}, // -HUGE()
+ {"(RZ,F7.0)", " 1.e999", 0x7fefffffffffffff, 0}, // +HUGE()
+ {"(RD,F7.0)", " 1.e999", 0x7fefffffffffffff, 0}, // +HUGE()
+ {"(RU,F7.0)", " 1.e999", 0x7ff0000000000000, ovf}, // +Inf
+ {"(RZ,F7.0)", "-1.e999", 0xffefffffffffffff, 0}, // -HUGE()
+ {"(RD,F7.0)", "-1.e999", 0xfff0000000000000, ovf}, // -Inf
+ {"(RU,F7.0)", "-1.e999", 0xffefffffffffffff, 0}, // -HUGE()
+ {"(E9.1)", " 1.0E-325", 0x0, 0},
+ {"(RU,E9.1)", " 1.0E-325", 0x1, 0},
+ {"(E9.1)", "-1.0E-325", 0x0, 0},
+ {"(RD,E9.1)", "-1.0E-325", 0x8000000000000001, 0},
};
- for (auto const &[format, data, want] : testCases) {
+ for (auto const &[format, data, want, iostat] : testCases) {
auto cookie{IONAME(BeginInternalFormattedInput)(
data, std::strlen(data), format, std::strlen(format))};
union {
@@ -899,12 +936,14 @@ TEST(IOApiTests, EditDoubleInputValues) {
char iomsg[bufferSize];
std::memset(iomsg, '\0', bufferSize - 1);
- // Ensure no errors were encountered reading input buffer into union value
+ // Ensure no unexpected errors were encountered reading input buffer into
+ // union value
IONAME(GetIoMsg)(cookie, iomsg, bufferSize - 1);
auto status{IONAME(EndIoStatement)(cookie)};
- ASSERT_EQ(status, 0) << '\'' << format << "' failed reading '" << data
- << "', status " << static_cast<int>(status)
- << " iomsg '" << iomsg << "'";
+ ASSERT_EQ(status, iostat)
+ << '\'' << format << "' failed reading '" << data << "', status "
+ << static_cast<int>(status) << " != expected " << iostat << " iomsg '"
+ << iomsg << "'";
// Ensure raw uint64 value matches expected conversion from double
ASSERT_EQ(u.raw, want) << '\'' << format << "' failed reading '" << data
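
The rewritten EX rounding cases are built around the raw pattern 0x3ff0100000000000, and the input cases now carry an expected IOSTAT column so overflowing reads can be matched against IostatRealInputOverflow. As a quick check of what that pattern decodes to, a minimal sketch using only the standard library:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      std::uint64_t raw = 0x3ff0100000000000ULL;  // raw pattern used by the new cases
      double d;
      std::memcpy(&d, &raw, sizeof d);  // type-pun via memcpy; assumes IEEE-754 binary64
      assert(d == 1.00390625);          // 1 + 2^-8, hence the F11.8 output 1.00390625
      return 0;
    }
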
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index f3ccbdc9eb15..05c46a64297a 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -5,8 +5,9 @@ endif()
# Set up the target architectures to build the GPU libc for.
set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
- "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
- "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
+ "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942"
+ "gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034"
+ "gfx1035;gfx1036"
"gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151")
set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
"sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90")
diff --git a/libc/fuzzing/stdlib/CMakeLists.txt b/libc/fuzzing/stdlib/CMakeLists.txt
index 09ac985623ca..711b0fd9820f 100644
--- a/libc/fuzzing/stdlib/CMakeLists.txt
+++ b/libc/fuzzing/stdlib/CMakeLists.txt
@@ -26,7 +26,7 @@ add_libc_fuzzer(
libc.src.stdlib.strtof
libc.src.stdlib.strtod
libc.src.stdlib.strtold
- libc.src.__support.FPUtil.float_properties
+ libc.src.__support.FPUtil.fp_bits
)
add_libc_fuzzer(
diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
index ea2f492f57ae..0e0d82fd3e8a 100644
--- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
+++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
@@ -14,7 +14,7 @@
#include "src/stdlib/strtof.h"
#include "src/stdlib/strtold.h"
-#include "src/__support/FPUtil/FloatProperties.h"
+#include "src/__support/FPUtil/FPBits.h"
#include <math.h>
#include <stddef.h>
diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h
index 4de142b56165..122f6b8c3328 100644
--- a/libc/src/__support/CPP/bit.h
+++ b/libc/src/__support/CPP/bit.h
@@ -29,10 +29,10 @@ namespace LIBC_NAMESPACE::cpp {
// UB in the implementation.
template <
typename To, typename From,
- typename = cpp::enable_if_t<sizeof(To) == sizeof(From)>,
- typename = cpp::enable_if_t<cpp::is_trivially_constructible<To>::value>,
- typename = cpp::enable_if_t<cpp::is_trivially_copyable<To>::value>,
- typename = cpp::enable_if_t<cpp::is_trivially_copyable<From>::value>>
+ typename = cpp::enable_if_t<sizeof(To) == sizeof(From) &&
+ cpp::is_trivially_constructible<To>::value &&
+ cpp::is_trivially_copyable<To>::value &&
+ cpp::is_trivially_copyable<From>::value>>
LIBC_INLINE constexpr To bit_cast(const From &from) {
MSAN_UNPOISON(&from, sizeof(From));
#if LIBC_HAS_BUILTIN(__builtin_bit_cast)
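
Folding the separate defaulted enable_if_t parameters into one conjunction keeps bit_cast to a single SFINAE slot, which the later UInt.h change relies on when it adds an overload distinguished only by an extra defaulted template parameter. A minimal standalone sketch of the same constraint style using std::enable_if_t (bit_cast_sketch is a hypothetical name):

    #include <cstring>
    #include <type_traits>

    // One combined constraint in a single defaulted template parameter.
    template <typename To, typename From,
              typename = std::enable_if_t<sizeof(To) == sizeof(From) &&
                                          std::is_trivially_constructible<To>::value &&
                                          std::is_trivially_copyable<To>::value &&
                                          std::is_trivially_copyable<From>::value>>
    To bit_cast_sketch(const From &from) {
      To to;
      std::memcpy(&to, &from, sizeof(To));  // well-defined for trivially copyable types
      return to;
    }

    int main() {
      float f = 1.0f;
      auto u = bit_cast_sketch<unsigned int>(f);  // 0x3f800000 on IEEE-754 targets
      return u == 0x3f800000u ? 0 : 1;
    }
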
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index 1cb22536a1cf..ad2c4ad27bce 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -24,24 +24,17 @@ add_header_library(
)
add_header_library(
- float_properties
- HDRS
- FloatProperties.h
- DEPENDS
- libc.src.__support.macros.properties.float
- libc.src.__support.uint128
- libc.src.__support.math_extras
-)
-
-add_header_library(
fp_bits
HDRS
FPBits.h
DEPENDS
- .float_properties
libc.src.__support.common
libc.src.__support.CPP.bit
libc.src.__support.CPP.type_traits
+ libc.src.__support.macros.attributes
+ libc.src.__support.macros.properties.float
+ libc.src.__support.math_extras
+ libc.src.__support.uint128
)
add_header_library(
@@ -49,7 +42,6 @@ add_header_library(
HDRS
fpbits_str.h
DEPENDS
- .float_properties
.fp_bits
libc.src.__support.CPP.bit
libc.src.__support.CPP.type_traits
@@ -230,7 +222,6 @@ add_header_library(
HDRS
dyadic_float.h
DEPENDS
- .float_properties
.fp_bits
.multiply_add
libc.src.__support.common
diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h
index 37e2820eab85..d06625ed1385 100644
--- a/libc/src/__support/FPUtil/FPBits.h
+++ b/libc/src/__support/FPUtil/FPBits.h
@@ -11,50 +11,232 @@
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/type_traits.h"
+#include "src/__support/UInt128.h"
#include "src/__support/common.h"
-#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR
+#include "src/__support/macros/properties/float.h" // LIBC_COMPILER_HAS_FLOAT128
+#include "src/__support/math_extras.h" // mask_trailing_ones
-#include "FloatProperties.h"
#include <stdint.h>
namespace LIBC_NAMESPACE {
namespace fputil {
-// A generic class to represent single precision, double precision, and quad
-// precision IEEE 754 floating point formats.
-// On most platforms, the 'float' type corresponds to single precision floating
-// point numbers, the 'double' type corresponds to double precision floating
-// point numers, and the 'long double' type corresponds to the quad precision
-// floating numbers. On x86 platforms however, the 'long double' type maps to
-// an x87 floating point format. This format is an IEEE 754 extension format.
-// It is handled as an explicit specialization of this class.
-template <typename T> struct FPBits : private FloatProperties<T> {
- static_assert(cpp::is_floating_point_v<T>,
- "FPBits instantiated with invalid type.");
- using typename FloatProperties<T>::StorageType;
- using FloatProperties<T>::TOTAL_LEN;
+// The supported floating point types.
+enum class FPType {
+ IEEE754_Binary16,
+ IEEE754_Binary32,
+ IEEE754_Binary64,
+ IEEE754_Binary128,
+ X86_Binary80,
+};
+
+namespace internal {
+
+// The type of encoding for supported floating point types.
+enum class FPEncoding {
+ IEEE754,
+ X86_ExtendedPrecision,
+};
+
+template <FPType> struct FPBaseProperties {};
+
+template <> struct FPBaseProperties<FPType::IEEE754_Binary16> {
+ using StorageType = uint16_t;
+ LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 16;
+ LIBC_INLINE_VAR static constexpr int SIG_LEN = 10;
+ LIBC_INLINE_VAR static constexpr int EXP_LEN = 5;
+ LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
+};
+
+template <> struct FPBaseProperties<FPType::IEEE754_Binary32> {
+ using StorageType = uint32_t;
+ LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 32;
+ LIBC_INLINE_VAR static constexpr int SIG_LEN = 23;
+ LIBC_INLINE_VAR static constexpr int EXP_LEN = 8;
+ LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
+};
+
+template <> struct FPBaseProperties<FPType::IEEE754_Binary64> {
+ using StorageType = uint64_t;
+ LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 64;
+ LIBC_INLINE_VAR static constexpr int SIG_LEN = 52;
+ LIBC_INLINE_VAR static constexpr int EXP_LEN = 11;
+ LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
+};
+
+template <> struct FPBaseProperties<FPType::IEEE754_Binary128> {
+ using StorageType = UInt128;
+ LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 128;
+ LIBC_INLINE_VAR static constexpr int SIG_LEN = 112;
+ LIBC_INLINE_VAR static constexpr int EXP_LEN = 15;
+ LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
+};
+
+template <> struct FPBaseProperties<FPType::X86_Binary80> {
+ using StorageType = UInt128;
+ LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 80;
+ LIBC_INLINE_VAR static constexpr int SIG_LEN = 64;
+ LIBC_INLINE_VAR static constexpr int EXP_LEN = 15;
+ LIBC_INLINE_VAR static constexpr auto ENCODING =
+ FPEncoding::X86_ExtendedPrecision;
+};
+
+} // namespace internal
+template <FPType fp_type>
+struct FPProperties : public internal::FPBaseProperties<fp_type> {
private:
- using FloatProperties<T>::EXP_SIG_MASK;
+ using UP = internal::FPBaseProperties<fp_type>;
+
+public:
+  // The number of bits to represent sign. For documentation purposes, always 1.
+ LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1;
+ using UP::EXP_LEN; // The number of bits for the *exponent* part
+ using UP::SIG_LEN; // The number of bits for the *significand* part
+ using UP::TOTAL_LEN; // For convenience, the sum of `SIG_LEN`, `EXP_LEN`,
+ // and `SIGN_LEN`.
+ static_assert(SIGN_LEN + EXP_LEN + SIG_LEN == TOTAL_LEN);
+
+ // An unsigned integer that is wide enough to contain all of the floating
+ // point bits.
+ using StorageType = typename UP::StorageType;
+
+ // The number of bits in StorageType.
+ LIBC_INLINE_VAR static constexpr int STORAGE_LEN =
+ sizeof(StorageType) * CHAR_BIT;
+ static_assert(STORAGE_LEN >= TOTAL_LEN);
+
+ // The exponent bias. Always positive.
+ LIBC_INLINE_VAR static constexpr int32_t EXP_BIAS =
+ (1U << (EXP_LEN - 1U)) - 1U;
+ static_assert(EXP_BIAS > 0);
+
+protected:
+ // The shift amount to get the *significand* part to the least significant
+ // bit. Always `0` but kept for consistency.
+ LIBC_INLINE_VAR static constexpr int SIG_MASK_SHIFT = 0;
+ // The shift amount to get the *exponent* part to the least significant bit.
+ LIBC_INLINE_VAR static constexpr int EXP_MASK_SHIFT = SIG_LEN;
+ // The shift amount to get the *sign* part to the least significant bit.
+ LIBC_INLINE_VAR static constexpr int SIGN_MASK_SHIFT = SIG_LEN + EXP_LEN;
+
+ // The bit pattern that keeps only the *significand* part.
+ LIBC_INLINE_VAR static constexpr StorageType SIG_MASK =
+ mask_trailing_ones<StorageType, SIG_LEN>() << SIG_MASK_SHIFT;
public:
- using FloatProperties<T>::EXP_MASK;
- using FloatProperties<T>::EXP_BIAS;
- using FloatProperties<T>::EXP_LEN;
- using FloatProperties<T>::FRACTION_MASK;
- using FloatProperties<T>::FRACTION_LEN;
+ // The bit pattern that keeps only the *exponent* part.
+ LIBC_INLINE_VAR static constexpr StorageType EXP_MASK =
+ mask_trailing_ones<StorageType, EXP_LEN>() << EXP_MASK_SHIFT;
+ // The bit pattern that keeps only the *sign* part.
+ LIBC_INLINE_VAR static constexpr StorageType SIGN_MASK =
+ mask_trailing_ones<StorageType, SIGN_LEN>() << SIGN_MASK_SHIFT;
+ // The bit pattern that keeps only the *exponent + significand* part.
+ LIBC_INLINE_VAR static constexpr StorageType EXP_SIG_MASK =
+ mask_trailing_ones<StorageType, EXP_LEN + SIG_LEN>();
+ // The bit pattern that keeps only the *sign + exponent + significand* part.
+ LIBC_INLINE_VAR static constexpr StorageType FP_MASK =
+ mask_trailing_ones<StorageType, TOTAL_LEN>();
+
+ static_assert((SIG_MASK & EXP_MASK & SIGN_MASK) == 0, "masks disjoint");
+ static_assert((SIG_MASK | EXP_MASK | SIGN_MASK) == FP_MASK, "masks cover");
private:
- using FloatProperties<T>::QUIET_NAN_MASK;
+ LIBC_INLINE static constexpr StorageType bit_at(int position) {
+ return StorageType(1) << position;
+ }
+
+public:
+ // The number of bits after the decimal dot when the number is in normal form.
+ LIBC_INLINE_VAR static constexpr int FRACTION_LEN =
+ UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision ? SIG_LEN - 1
+ : SIG_LEN;
+ LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION =
+ FRACTION_LEN + 1;
+ LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK =
+ mask_trailing_ones<StorageType, FRACTION_LEN>();
+
+protected:
+ // If a number x is a NAN, then it is a quiet NAN if:
+ // QUIET_NAN_MASK & bits(x) != 0
+ LIBC_INLINE_VAR static constexpr StorageType QUIET_NAN_MASK =
+ UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision
+ ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 2) // 0b1100...
+ : bit_at(SIG_LEN - 1); // 0b1000...
+
+ // If a number x is a NAN, then it is a signalling NAN if:
+ // SIGNALING_NAN_MASK & bits(x) != 0
+ LIBC_INLINE_VAR static constexpr StorageType SIGNALING_NAN_MASK =
+ UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision
+ ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 3) // 0b1010...
+ : bit_at(SIG_LEN - 2); // 0b0100...
+};
+
+//-----------------------------------------------------------------------------
+template <typename FP> LIBC_INLINE static constexpr FPType get_fp_type() {
+ if constexpr (cpp::is_same_v<FP, float> && __FLT_MANT_DIG__ == 24)
+ return FPType::IEEE754_Binary32;
+ else if constexpr (cpp::is_same_v<FP, double> && __DBL_MANT_DIG__ == 53)
+ return FPType::IEEE754_Binary64;
+ else if constexpr (cpp::is_same_v<FP, long double>) {
+ if constexpr (__LDBL_MANT_DIG__ == 53)
+ return FPType::IEEE754_Binary64;
+ else if constexpr (__LDBL_MANT_DIG__ == 64)
+ return FPType::X86_Binary80;
+ else if constexpr (__LDBL_MANT_DIG__ == 113)
+ return FPType::IEEE754_Binary128;
+ }
+#if defined(LIBC_COMPILER_HAS_C23_FLOAT16)
+ else if constexpr (cpp::is_same_v<FP, _Float16>)
+ return FPType::IEEE754_Binary16;
+#endif
+#if defined(LIBC_COMPILER_HAS_C23_FLOAT128)
+ else if constexpr (cpp::is_same_v<FP, _Float128>)
+ return FPType::IEEE754_Binary128;
+#endif
+#if defined(LIBC_COMPILER_HAS_FLOAT128_EXTENSION)
+ else if constexpr (cpp::is_same_v<FP, __float128>)
+ return FPType::IEEE754_Binary128;
+#endif
+ else
+ static_assert(cpp::always_false<FP>, "Unsupported type");
+}
+
+template <typename FP>
+struct FloatProperties : public FPProperties<get_fp_type<FP>()> {};
+
+namespace internal {
+
+// This is a temporary class to unify common methods and properties between
+// FPBits and FPBits<long double>.
+template <FPType fp_type> struct FPBitsCommon : private FPProperties<fp_type> {
+ using UP = FPProperties<fp_type>;
+ using typename UP::StorageType;
+ using UP::TOTAL_LEN;
+
+protected:
+ using UP::EXP_SIG_MASK;
+ using UP::QUIET_NAN_MASK;
public:
- using FloatProperties<T>::SIGN_MASK;
+ using UP::EXP_BIAS;
+ using UP::EXP_LEN;
+ using UP::EXP_MASK;
+ using UP::EXP_MASK_SHIFT;
+ using UP::FP_MASK;
+ using UP::FRACTION_LEN;
+ using UP::FRACTION_MASK;
+ using UP::SIGN_MASK;
// Reinterpreting bits as an integer value and interpreting the bits of an
// integer value as a floating point value is used in tests. So, a convenient
// type is provided for such reinterpretations.
StorageType bits;
+ LIBC_INLINE constexpr FPBitsCommon() : bits(0) {}
+ LIBC_INLINE explicit constexpr FPBitsCommon(StorageType bits) : bits(bits) {}
+
LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) {
mantVal &= FRACTION_MASK;
bits &= ~FRACTION_MASK;
@@ -65,15 +247,88 @@ public:
return bits & FRACTION_MASK;
}
- LIBC_INLINE constexpr void set_biased_exponent(StorageType expVal) {
- expVal = (expVal << FRACTION_LEN) & EXP_MASK;
+ LIBC_INLINE constexpr void set_sign(bool signVal) {
+ if (get_sign() != signVal)
+ bits ^= SIGN_MASK;
+ }
+
+ LIBC_INLINE constexpr bool get_sign() const {
+ return (bits & SIGN_MASK) != 0;
+ }
+
+ LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) {
+ // clear exponent bits
bits &= ~EXP_MASK;
- bits |= expVal;
+ // set exponent bits
+ bits |= (biased << EXP_MASK_SHIFT) & EXP_MASK;
}
LIBC_INLINE constexpr uint16_t get_biased_exponent() const {
- return uint16_t((bits & EXP_MASK) >> FRACTION_LEN);
+ return uint16_t((bits & EXP_MASK) >> EXP_MASK_SHIFT);
+ }
+
+ LIBC_INLINE constexpr int get_exponent() const {
+ return int(get_biased_exponent()) - EXP_BIAS;
+ }
+
+ // If the number is subnormal, the exponent is treated as if it were the
+ // minimum exponent for a normal number. This is to keep continuity between
+ // the normal and subnormal ranges, but it causes problems for functions where
+ // values are calculated from the exponent, since just subtracting the bias
+ // will give a slightly incorrect result. Additionally, zero has an exponent
+ // of zero, and that should actually be treated as zero.
+ LIBC_INLINE constexpr int get_explicit_exponent() const {
+ const int biased_exp = int(get_biased_exponent());
+ if (is_zero()) {
+ return 0;
+ } else if (biased_exp == 0) {
+ return 1 - EXP_BIAS;
+ } else {
+ return biased_exp - EXP_BIAS;
+ }
+ }
+
+ LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; }
+
+ LIBC_INLINE constexpr bool is_zero() const {
+ return (bits & EXP_SIG_MASK) == 0;
}
+};
+
+} // namespace internal
+
+// A generic class to represent single precision, double precision, and quad
+// precision IEEE 754 floating point formats.
+// On most platforms, the 'float' type corresponds to single precision floating
+// point numbers, the 'double' type corresponds to double precision floating
+// point numbers, and the 'long double' type corresponds to the quad precision
+// floating point numbers. On x86 platforms, however, the 'long double' type maps to
+// an x87 floating point format. This format is an IEEE 754 extension format.
+// It is handled as an explicit specialization of this class.
+template <typename T>
+struct FPBits : public internal::FPBitsCommon<get_fp_type<T>()> {
+ static_assert(cpp::is_floating_point_v<T>,
+ "FPBits instantiated with invalid type.");
+ using UP = internal::FPBitsCommon<get_fp_type<T>()>;
+ using StorageType = typename UP::StorageType;
+ using UP::bits;
+
+private:
+ using UP::EXP_SIG_MASK;
+ using UP::QUIET_NAN_MASK;
+
+public:
+ using UP::EXP_BIAS;
+ using UP::EXP_LEN;
+ using UP::EXP_MASK;
+ using UP::EXP_MASK_SHIFT;
+ using UP::FRACTION_LEN;
+ using UP::FRACTION_MASK;
+ using UP::SIGN_MASK;
+ using UP::TOTAL_LEN;
+
+ using UP::get_biased_exponent;
+ using UP::is_zero;
  // The function returns the mantissa with the implicit bit set iff the current
  // value is a valid normal number.
@@ -84,19 +339,6 @@ public:
(FRACTION_MASK & bits);
}
- LIBC_INLINE constexpr void set_sign(bool signVal) {
- bits |= SIGN_MASK;
- if (!signVal)
- bits -= SIGN_MASK;
- }
-
- LIBC_INLINE constexpr bool get_sign() const {
- return (bits & SIGN_MASK) != 0;
- }
-
- static_assert(sizeof(T) == sizeof(StorageType),
- "Data type and integral representation have different sizes.");
-
static constexpr int MAX_BIASED_EXPONENT = (1 << EXP_LEN) - 1;
static constexpr StorageType MIN_SUBNORMAL = StorageType(1);
static constexpr StorageType MAX_SUBNORMAL = FRACTION_MASK;
@@ -108,49 +350,21 @@ public:
// type match.
template <typename XType, cpp::enable_if_t<cpp::is_same_v<T, XType>, int> = 0>
LIBC_INLINE constexpr explicit FPBits(XType x)
- : bits(cpp::bit_cast<StorageType>(x)) {}
+ : UP(cpp::bit_cast<StorageType>(x)) {}
template <typename XType,
cpp::enable_if_t<cpp::is_same_v<XType, StorageType>, int> = 0>
- LIBC_INLINE constexpr explicit FPBits(XType x) : bits(x) {}
-
- LIBC_INLINE constexpr FPBits() : bits(0) {}
+ LIBC_INLINE constexpr explicit FPBits(XType x) : UP(x) {}
- LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast<T>(bits); }
+ LIBC_INLINE constexpr FPBits() : UP() {}
LIBC_INLINE constexpr void set_val(T value) {
bits = cpp::bit_cast<StorageType>(value);
}
- LIBC_INLINE constexpr explicit operator T() const { return get_val(); }
-
- LIBC_INLINE constexpr StorageType uintval() const { return bits; }
-
- LIBC_INLINE constexpr int get_exponent() const {
- return int(get_biased_exponent()) - EXP_BIAS;
- }
-
- // If the number is subnormal, the exponent is treated as if it were the
- // minimum exponent for a normal number. This is to keep continuity between
- // the normal and subnormal ranges, but it causes problems for functions where
- // values are calculated from the exponent, since just subtracting the bias
- // will give a slightly incorrect result. Additionally, zero has an exponent
- // of zero, and that should actually be treated as zero.
- LIBC_INLINE constexpr int get_explicit_exponent() const {
- const int biased_exp = int(get_biased_exponent());
- if (is_zero()) {
- return 0;
- } else if (biased_exp == 0) {
- return 1 - EXP_BIAS;
- } else {
- return biased_exp - EXP_BIAS;
- }
- }
+ LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast<T>(bits); }
- LIBC_INLINE constexpr bool is_zero() const {
- // Remove sign bit by shift
- return (bits << 1) == 0;
- }
+ LIBC_INLINE constexpr explicit operator T() const { return get_val(); }
LIBC_INLINE constexpr bool is_inf() const {
return (bits & EXP_SIG_MASK) == EXP_MASK;
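
In the new FPProperties layering, every mask is derived from the three field widths and checked with static_asserts. A minimal sketch that recomputes the binary32 masks the same way in plain C++ (the helper below stands in for mask_trailing_ones):

    #include <cstdint>

    // Build a mask of `len` trailing ones shifted into place, sketching the
    // mask_trailing_ones<StorageType, len>() << shift pattern used in the patch.
    constexpr std::uint32_t mask(int len, int shift) {
      return (len == 32 ? ~0u : ((1u << len) - 1u)) << shift;
    }

    constexpr int SIG_LEN = 23, EXP_LEN = 8, SIGN_LEN = 1;
    constexpr std::uint32_t SIG_MASK = mask(SIG_LEN, 0);                    // 0x007fffff
    constexpr std::uint32_t EXP_MASK = mask(EXP_LEN, SIG_LEN);              // 0x7f800000
    constexpr std::uint32_t SIGN_MASK = mask(SIGN_LEN, SIG_LEN + EXP_LEN);  // 0x80000000

    static_assert((SIG_MASK & EXP_MASK & SIGN_MASK) == 0, "masks disjoint");
    static_assert((SIG_MASK | EXP_MASK | SIGN_MASK) == 0xffffffffu, "masks cover");

    int main() { return 0; }  // the static_asserts above are the whole check
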
diff --git a/libc/src/__support/FPUtil/FloatProperties.h b/libc/src/__support/FPUtil/FloatProperties.h
deleted file mode 100644
index bcf1f7cfabd3..000000000000
--- a/libc/src/__support/FPUtil/FloatProperties.h
+++ /dev/null
@@ -1,211 +0,0 @@
-//===-- Properties of floating point numbers --------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_FLOATPROPERTIES_H
-#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_FLOATPROPERTIES_H
-
-#include "src/__support/UInt128.h"
-#include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR
-#include "src/__support/macros/properties/float.h" // LIBC_COMPILER_HAS_FLOAT128
-#include "src/__support/math_extras.h" // mask_trailing_ones
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE {
-namespace fputil {
-
-// The supported floating point types.
-enum class FPType {
- IEEE754_Binary16,
- IEEE754_Binary32,
- IEEE754_Binary64,
- IEEE754_Binary128,
- X86_Binary80,
-};
-
-// For now 'FPEncoding', 'FPBaseProperties' and 'FPCommonProperties' are
-// implementation details.
-namespace internal {
-
-// The type of encoding for supported floating point types.
-enum class FPEncoding {
- IEEE754,
- X86_ExtendedPrecision,
-};
-
-template <FPType> struct FPBaseProperties {};
-
-template <> struct FPBaseProperties<FPType::IEEE754_Binary16> {
- using StorageType = uint16_t;
- LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 16;
- LIBC_INLINE_VAR static constexpr int SIG_LEN = 10;
- LIBC_INLINE_VAR static constexpr int EXP_LEN = 5;
- LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
-};
-
-template <> struct FPBaseProperties<FPType::IEEE754_Binary32> {
- using StorageType = uint32_t;
- LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 32;
- LIBC_INLINE_VAR static constexpr int SIG_LEN = 23;
- LIBC_INLINE_VAR static constexpr int EXP_LEN = 8;
- LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
-};
-
-template <> struct FPBaseProperties<FPType::IEEE754_Binary64> {
- using StorageType = uint64_t;
- LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 64;
- LIBC_INLINE_VAR static constexpr int SIG_LEN = 52;
- LIBC_INLINE_VAR static constexpr int EXP_LEN = 11;
- LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
-};
-
-template <> struct FPBaseProperties<FPType::IEEE754_Binary128> {
- using StorageType = UInt128;
- LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 128;
- LIBC_INLINE_VAR static constexpr int SIG_LEN = 112;
- LIBC_INLINE_VAR static constexpr int EXP_LEN = 15;
- LIBC_INLINE_VAR static constexpr auto ENCODING = FPEncoding::IEEE754;
-};
-
-template <> struct FPBaseProperties<FPType::X86_Binary80> {
- using StorageType = UInt128;
- LIBC_INLINE_VAR static constexpr int TOTAL_LEN = 80;
- LIBC_INLINE_VAR static constexpr int SIG_LEN = 64;
- LIBC_INLINE_VAR static constexpr int EXP_LEN = 15;
- LIBC_INLINE_VAR static constexpr auto ENCODING =
- FPEncoding::X86_ExtendedPrecision;
-};
-
-} // namespace internal
-
-template <FPType fp_type>
-struct FPProperties : public internal::FPBaseProperties<fp_type> {
-private:
- using UP = internal::FPBaseProperties<fp_type>;
-
-public:
- // The number of bits to represent sign. For documentation purpose, always 1.
- LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1;
- using UP::EXP_LEN; // The number of bits for the *exponent* part
- using UP::SIG_LEN; // The number of bits for the *significand* part
- using UP::TOTAL_LEN; // For convenience, the sum of `SIG_LEN`, `EXP_LEN`,
- // and `SIGN_LEN`.
- static_assert(SIGN_LEN + EXP_LEN + SIG_LEN == TOTAL_LEN);
-
- // An unsigned integer that is wide enough to contain all of the floating
- // point bits.
- using StorageType = typename UP::StorageType;
-
- // The number of bits in StorageType.
- LIBC_INLINE_VAR static constexpr int STORAGE_LEN =
- sizeof(StorageType) * CHAR_BIT;
- static_assert(STORAGE_LEN >= TOTAL_LEN);
-
- // The exponent bias. Always positive.
- LIBC_INLINE_VAR static constexpr int32_t EXP_BIAS =
- (1U << (EXP_LEN - 1U)) - 1U;
- static_assert(EXP_BIAS > 0);
-
-private:
- // The shift amount to get the *significand* part to the least significant
- // bit. Always `0` but kept for consistency.
- LIBC_INLINE_VAR static constexpr int SIG_MASK_SHIFT = 0;
- // The shift amount to get the *exponent* part to the least significant bit.
- LIBC_INLINE_VAR static constexpr int EXP_MASK_SHIFT = SIG_LEN;
- // The shift amount to get the *sign* part to the least significant bit.
- LIBC_INLINE_VAR static constexpr int SIGN_MASK_SHIFT = SIG_LEN + EXP_LEN;
-
- // The bit pattern that keeps only the *significand* part.
- LIBC_INLINE_VAR static constexpr StorageType SIG_MASK =
- mask_trailing_ones<StorageType, SIG_LEN>() << SIG_MASK_SHIFT;
-
-public:
- // The bit pattern that keeps only the *exponent* part.
- LIBC_INLINE_VAR static constexpr StorageType EXP_MASK =
- mask_trailing_ones<StorageType, EXP_LEN>() << EXP_MASK_SHIFT;
- // The bit pattern that keeps only the *sign* part.
- LIBC_INLINE_VAR static constexpr StorageType SIGN_MASK =
- mask_trailing_ones<StorageType, SIGN_LEN>() << SIGN_MASK_SHIFT;
- // The bit pattern that keeps only the *exponent + significand* part.
- LIBC_INLINE_VAR static constexpr StorageType EXP_SIG_MASK =
- mask_trailing_ones<StorageType, EXP_LEN + SIG_LEN>();
- // The bit pattern that keeps only the *sign + exponent + significand* part.
- LIBC_INLINE_VAR static constexpr StorageType FP_MASK =
- mask_trailing_ones<StorageType, TOTAL_LEN>();
-
- static_assert((SIG_MASK & EXP_MASK & SIGN_MASK) == 0, "masks disjoint");
- static_assert((SIG_MASK | EXP_MASK | SIGN_MASK) == FP_MASK, "masks cover");
-
-private:
- LIBC_INLINE static constexpr StorageType bit_at(int position) {
- return StorageType(1) << position;
- }
-
-public:
- // The number of bits after the decimal dot when the number is in normal form.
- LIBC_INLINE_VAR static constexpr int FRACTION_LEN =
- UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision ? SIG_LEN - 1
- : SIG_LEN;
- LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION =
- FRACTION_LEN + 1;
- LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK =
- mask_trailing_ones<StorageType, FRACTION_LEN>();
-
-protected:
- // If a number x is a NAN, then it is a quiet NAN if:
- // QUIET_NAN_MASK & bits(x) != 0
- LIBC_INLINE_VAR static constexpr StorageType QUIET_NAN_MASK =
- UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision
- ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 2) // 0b1100...
- : bit_at(SIG_LEN - 1); // 0b1000...
-
- // If a number x is a NAN, then it is a signalling NAN if:
- // SIGNALING_NAN_MASK & bits(x) != 0
- LIBC_INLINE_VAR static constexpr StorageType SIGNALING_NAN_MASK =
- UP::ENCODING == internal::FPEncoding::X86_ExtendedPrecision
- ? bit_at(SIG_LEN - 1) | bit_at(SIG_LEN - 3) // 0b1010...
- : bit_at(SIG_LEN - 2); // 0b0100...
-};
-
-//-----------------------------------------------------------------------------
-template <typename FP> LIBC_INLINE static constexpr FPType get_fp_type() {
- if constexpr (cpp::is_same_v<FP, float> && __FLT_MANT_DIG__ == 24)
- return FPType::IEEE754_Binary32;
- else if constexpr (cpp::is_same_v<FP, double> && __DBL_MANT_DIG__ == 53)
- return FPType::IEEE754_Binary64;
- else if constexpr (cpp::is_same_v<FP, long double>) {
- if constexpr (__LDBL_MANT_DIG__ == 53)
- return FPType::IEEE754_Binary64;
- else if constexpr (__LDBL_MANT_DIG__ == 64)
- return FPType::X86_Binary80;
- else if constexpr (__LDBL_MANT_DIG__ == 113)
- return FPType::IEEE754_Binary128;
- }
-#if defined(LIBC_COMPILER_HAS_C23_FLOAT16)
- else if constexpr (cpp::is_same_v<FP, _Float16>)
- return FPType::IEEE754_Binary16;
-#endif
-#if defined(LIBC_COMPILER_HAS_C23_FLOAT128)
- else if constexpr (cpp::is_same_v<FP, _Float128>)
- return FPType::IEEE754_Binary128;
-#endif
-#if defined(LIBC_COMPILER_HAS_FLOAT128_EXTENSION)
- else if constexpr (cpp::is_same_v<FP, __float128>)
- return FPType::IEEE754_Binary128;
-#endif
- else
- static_assert(cpp::always_false<FP>, "Unsupported type");
-}
-
-template <typename FP>
-struct FloatProperties : public FPProperties<get_fp_type<FP>()> {};
-
-} // namespace fputil
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_FLOATPROPERTIES_H
diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h
index 8ea753564ed2..a2064594e63a 100644
--- a/libc/src/__support/FPUtil/ManipulationFunctions.h
+++ b/libc/src/__support/FPUtil/ManipulationFunctions.h
@@ -10,7 +10,6 @@
#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_MANIPULATIONFUNCTIONS_H
#include "FPBits.h"
-#include "FloatProperties.h"
#include "NearestIntegerOperations.h"
#include "NormalFloat.h"
diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h
index f8056fecb8ec..561345fd87cf 100644
--- a/libc/src/__support/FPUtil/dyadic_float.h
+++ b/libc/src/__support/FPUtil/dyadic_float.h
@@ -10,7 +10,6 @@
#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H
#include "FPBits.h"
-#include "FloatProperties.h"
#include "multiply_add.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/UInt.h"
diff --git a/libc/src/__support/FPUtil/fpbits_str.h b/libc/src/__support/FPUtil/fpbits_str.h
index bab3d5f97a6b..ce368c89f95e 100644
--- a/libc/src/__support/FPUtil/fpbits_str.h
+++ b/libc/src/__support/FPUtil/fpbits_str.h
@@ -12,7 +12,6 @@
#include "src/__support/CPP/string.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/FloatProperties.h"
#include "src/__support/integer_to_string.h"
#include "src/__support/macros/attributes.h"
diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt
index b17f32521047..0ae62f40dc61 100644
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@@ -23,7 +23,6 @@ add_header_library(
libc.src.__support.CPP.bit
libc.src.__support.CPP.type_traits
libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.float_properties
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.rounding_mode
libc.src.__support.macros.optimization
@@ -39,7 +38,6 @@ add_header_library(
libc.src.__support.CPP.bit
libc.src.__support.CPP.type_traits
libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.float_properties
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.rounding_mode
libc.src.__support.macros.optimization
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index c70069487d99..4ba9e1d2be39 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -13,7 +13,6 @@
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/FloatProperties.h"
#include "src/__support/FPUtil/rounding_mode.h"
#include "src/__support/UInt128.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
diff --git a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h
index b2b016adb661..1011e61f03fd 100644
--- a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h
+++ b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h
@@ -26,20 +26,27 @@
namespace LIBC_NAMESPACE {
namespace fputil {
-template <> struct FPBits<long double> : private FloatProperties<long double> {
- using typename FloatProperties<long double>::StorageType;
- using FloatProperties<long double>::TOTAL_LEN;
- using FloatProperties<long double>::EXP_MASK;
- using FloatProperties<long double>::EXP_BIAS;
- using FloatProperties<long double>::EXP_LEN;
- using FloatProperties<long double>::FRACTION_MASK;
- using FloatProperties<long double>::FRACTION_LEN;
+template <>
+struct FPBits<long double>
+ : public internal::FPBitsCommon<FPType::X86_Binary80> {
+ using UP = internal::FPBitsCommon<FPType::X86_Binary80>;
+ using StorageType = typename UP::StorageType;
+ using UP::bits;
private:
- using FloatProperties<long double>::QUIET_NAN_MASK;
+ using UP::EXP_SIG_MASK;
+ using UP::QUIET_NAN_MASK;
public:
- using FloatProperties<long double>::SIGN_MASK;
+ using UP::EXP_BIAS;
+ using UP::EXP_LEN;
+ using UP::EXP_MASK;
+ using UP::EXP_MASK_SHIFT;
+ using UP::FP_MASK;
+ using UP::FRACTION_LEN;
+ using UP::FRACTION_MASK;
+ using UP::SIGN_MASK;
+ using UP::TOTAL_LEN;
static constexpr int MAX_BIASED_EXPONENT = 0x7FFF;
static constexpr StorageType MIN_SUBNORMAL = StorageType(1);
@@ -51,18 +58,6 @@ public:
(StorageType(MAX_BIASED_EXPONENT - 1) << (FRACTION_LEN + 1)) |
(StorageType(1) << FRACTION_LEN) | MAX_SUBNORMAL;
- StorageType bits;
-
- LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) {
- mantVal &= FRACTION_MASK;
- bits &= ~FRACTION_MASK;
- bits |= mantVal;
- }
-
- LIBC_INLINE constexpr StorageType get_mantissa() const {
- return bits & FRACTION_MASK;
- }
-
LIBC_INLINE constexpr StorageType get_explicit_mantissa() const {
// The x86 80 bit float represents the leading digit of the mantissa
// explicitly. This is the mask for that bit.
@@ -70,16 +65,6 @@ public:
return bits & (FRACTION_MASK | EXPLICIT_BIT_MASK);
}
- LIBC_INLINE constexpr void set_biased_exponent(StorageType expVal) {
- expVal = (expVal << (TOTAL_LEN - 1 - EXP_LEN)) & EXP_MASK;
- bits &= ~EXP_MASK;
- bits |= expVal;
- }
-
- LIBC_INLINE constexpr uint16_t get_biased_exponent() const {
- return uint16_t((bits & EXP_MASK) >> (TOTAL_LEN - 1 - EXP_LEN));
- }
-
LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) {
bits &= ~(StorageType(1) << FRACTION_LEN);
bits |= (StorageType(implicitVal) << FRACTION_LEN);
@@ -89,22 +74,12 @@ public:
return bool((bits & (StorageType(1) << FRACTION_LEN)) >> FRACTION_LEN);
}
- LIBC_INLINE constexpr void set_sign(bool signVal) {
- bits &= ~SIGN_MASK;
- StorageType sign1 = StorageType(signVal) << (TOTAL_LEN - 1);
- bits |= sign1;
- }
-
- LIBC_INLINE constexpr bool get_sign() const {
- return bool((bits & SIGN_MASK) >> (TOTAL_LEN - 1));
- }
-
- LIBC_INLINE constexpr FPBits() : bits(0) {}
+ LIBC_INLINE constexpr FPBits() : UP() {}
template <typename XType,
cpp::enable_if_t<cpp::is_same_v<long double, XType>, int> = 0>
LIBC_INLINE constexpr explicit FPBits(XType x)
- : bits(cpp::bit_cast<StorageType>(x)) {
+ : UP(cpp::bit_cast<StorageType>(x)) {
// bits starts uninitialized, and setting it to a long double only
// overwrites the first 80 bits. This clears those upper bits.
bits = bits & ((StorageType(1) << 80) - 1);
@@ -112,47 +87,16 @@ public:
template <typename XType,
cpp::enable_if_t<cpp::is_same_v<XType, StorageType>, int> = 0>
- LIBC_INLINE constexpr explicit FPBits(XType x) : bits(x) {}
+ LIBC_INLINE constexpr explicit FPBits(XType x) : UP(x) {}
LIBC_INLINE constexpr operator long double() {
return cpp::bit_cast<long double>(bits);
}
- LIBC_INLINE constexpr StorageType uintval() {
- // We zero the padding bits as they can contain garbage.
- return bits & FP_MASK;
- }
-
LIBC_INLINE constexpr long double get_val() const {
return cpp::bit_cast<long double>(bits);
}
- LIBC_INLINE constexpr int get_exponent() const {
- return int(get_biased_exponent()) - EXP_BIAS;
- }
-
- // If the number is subnormal, the exponent is treated as if it were the
- // minimum exponent for a normal number. This is to keep continuity between
- // the normal and subnormal ranges, but it causes problems for functions where
- // values are calculated from the exponent, since just subtracting the bias
- // will give a slightly incorrect result. Additionally, zero has an exponent
- // of zero, and that should actually be treated as zero.
- LIBC_INLINE constexpr int get_explicit_exponent() const {
- const int biased_exp = int(get_biased_exponent());
- if (is_zero()) {
- return 0;
- } else if (biased_exp == 0) {
- return 1 - EXP_BIAS;
- } else {
- return biased_exp - EXP_BIAS;
- }
- }
-
- LIBC_INLINE constexpr bool is_zero() const {
- return get_biased_exponent() == 0 && get_mantissa() == 0 &&
- get_implicit_bit() == 0;
- }
-
LIBC_INLINE constexpr bool is_inf() const {
return get_biased_exponent() == MAX_BIASED_EXPONENT &&
get_mantissa() == 0 && get_implicit_bit() == 1;
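
Unlike the IEEE formats, the x87 80-bit format stores the leading significand bit explicitly, which is why this specialization keeps get_implicit_bit and set_implicit_bit. A minimal sketch of reading that bit from a raw 80-bit pattern, assuming unsigned __int128 as a stand-in for the libc UInt128 type:

    #include <cassert>
    #include <cstdint>

    using U128 = unsigned __int128;   // stand-in for UInt128

    constexpr int FRACTION_LEN = 63;  // x87: 64-bit significand, explicit leading bit

    constexpr bool explicit_bit(U128 bits) {
      return ((bits >> FRACTION_LEN) & 1) != 0;
    }

    int main() {
      // 1.0L on x87: biased exponent 0x3FFF, significand 0x8000000000000000.
      U128 one = (U128(0x3FFF) << 64) | U128(0x8000000000000000ULL);
      assert(explicit_bit(one));   // normal numbers carry an explicit leading 1
      U128 tiny = U128(1);         // smallest subnormal: explicit bit is 0
      assert(!explicit_bit(tiny));
      return 0;
    }
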
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index cfd495c58618..79e05940f027 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -30,7 +30,7 @@ template <size_t Bits, bool Signed> struct BigInt {
static_assert(Bits > 0 && Bits % 64 == 0,
"Number of bits in BigInt should be a multiple of 64.");
LIBC_INLINE_VAR static constexpr size_t WORDCOUNT = Bits / 64;
- uint64_t val[WORDCOUNT]{};
+ cpp::array<uint64_t, WORDCOUNT> val{};
LIBC_INLINE_VAR static constexpr uint64_t MASK32 = 0xFFFFFFFFu;
@@ -954,6 +954,48 @@ struct make_signed<UInt<Bits>> : type_identity<Int<Bits>> {
"Number of bits in Int should be a multiple of 64.");
};
+namespace internal {
+template <typename T> struct is_custom_uint : cpp::false_type {};
+template <size_t Bits> struct is_custom_uint<UInt<Bits>> : cpp::true_type {};
+} // namespace internal
+
+// bit_cast to UInt
+// Note: The standard scheme for SFINAE selection is to have exactly one
+// function instantiation valid at a time. This is usually done by having a
+// predicate in one function and the negated predicate in the other one.
+// e.g.
+// template<typename = cpp::enable_if_t< is_custom_uint<To>::value == true> ...
+// template<typename = cpp::enable_if_t< is_custom_uint<To>::value == false> ...
+//
+// Unfortunately this would make the default 'cpp::bit_cast' aware of
+// 'is_custom_uint' (or any other customization). To prevent exposing all
+// customizations in the original function, we create a different function with
+// four 'typename's instead of three - otherwise it would be considered as a
+// redeclaration of the same function leading to "error: template parameter
+// redefines default argument".
+template <typename To, typename From,
+ typename = cpp::enable_if_t<sizeof(To) == sizeof(From) &&
+ cpp::is_trivially_copyable<To>::value &&
+ cpp::is_trivially_copyable<From>::value>,
+ typename = cpp::enable_if_t<internal::is_custom_uint<To>::value>>
+LIBC_INLINE constexpr To bit_cast(const From &from) {
+ To out;
+ using Storage = decltype(out.val);
+ out.val = cpp::bit_cast<Storage>(from);
+ return out;
+}
+
+// bit_cast from UInt
+template <
+ typename To, size_t Bits,
+ typename = cpp::enable_if_t<sizeof(To) == sizeof(UInt<Bits>) &&
+ cpp::is_trivially_constructible<To>::value &&
+ cpp::is_trivially_copyable<To>::value &&
+ cpp::is_trivially_copyable<UInt<Bits>>::value>>
+LIBC_INLINE constexpr To bit_cast(const UInt<Bits> &from) {
+ return cpp::bit_cast<To>(from.val);
+}
+
} // namespace LIBC_NAMESPACE::cpp
#endif // LLVM_LIBC_SRC___SUPPORT_UINT_H
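
The comment above explains why the UInt overload adds a fourth defaulted template parameter rather than negating the predicate inside the generic cpp::bit_cast. A minimal standalone sketch of that selection trick with hypothetical types (Wide stands in for UInt<128>; the generic overload is ruled out for Wide because its default member initializer makes it non-trivially default constructible, mirroring UInt's val{} member):

    #include <array>
    #include <cstdint>
    #include <cstring>
    #include <type_traits>

    // Stand-in for a custom wide integer; the default member initializer makes
    // it non-trivially default constructible, so the generic overload bows out.
    struct Wide { std::array<std::uint64_t, 2> val{}; };

    template <typename T> struct is_custom : std::false_type {};
    template <> struct is_custom<Wide> : std::true_type {};

    // Generic overload: three template parameters, knows nothing about Wide.
    template <typename To, typename From,
              typename = std::enable_if_t<sizeof(To) == sizeof(From) &&
                                          std::is_trivially_constructible<To>::value &&
                                          std::is_trivially_copyable<To>::value &&
                                          std::is_trivially_copyable<From>::value>>
    To bit_cast_sketch(const From &from) {
      To to;
      std::memcpy(&to, &from, sizeof(To));
      return to;
    }

    // Customization: a fourth defaulted template parameter makes this a distinct
    // template instead of a redeclaration, and it only matches custom destinations.
    template <typename To, typename From,
              typename = std::enable_if_t<sizeof(To) == sizeof(From)>,
              typename = std::enable_if_t<is_custom<To>::value>>
    To bit_cast_sketch(const From &from) {
      To out;
      std::memcpy(out.val.data(), &from, sizeof(From));
      return out;
    }

    int main() {
      std::array<std::uint64_t, 2> words{{1, 2}};
      Wide w = bit_cast_sketch<Wide>(words);                   // customized overload
      std::uint64_t u = bit_cast_sketch<std::uint64_t>(1.0);   // generic overload
      return (w.val[0] == 1 && w.val[1] == 2 && u == 0x3ff0000000000000ULL) ? 0 : 1;
    }
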
diff --git a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp
index 5428a0443088..c8e36404a781 100644
--- a/libc/src/math/generic/exp.cpp
+++ b/libc/src/math/generic/exp.cpp
@@ -51,6 +51,8 @@ constexpr double MLOG_2_EXP2_M12_MID = 0x1.718432a1b0e26p-47;
constexpr double MLOG_2_EXP2_M12_MID_30 = 0x1.718432ap-47;
constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79;
+namespace {
+
// Polynomial approximations with double precision:
// Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24.
// For |dx| < 2^-13 + 2^-30:
@@ -218,6 +220,8 @@ double set_exceptional(double x) {
return x + static_cast<double>(FPBits::inf());
}
+} // namespace
+
LLVM_LIBC_FUNCTION(double, exp, (double x)) {
using FPBits = typename fputil::FPBits<double>;
using FloatProp = typename fputil::FloatProperties<double>;
diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp
index aa66d4f17a3a..92b3a468a9cc 100644
--- a/libc/src/math/generic/exp10.cpp
+++ b/libc/src/math/generic/exp10.cpp
@@ -52,6 +52,8 @@ constexpr double ERR_D = 0x1.8p-63;
// Errors when using double-double precision.
constexpr double ERR_DD = 0x1.8p-99;
+namespace {
+
// Polynomial approximations with double precision. Generated by Sollya with:
// > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]);
// > P;
@@ -268,6 +270,8 @@ double set_exceptional(double x) {
return x + static_cast<double>(FPBits::inf());
}
+} // namespace
+
LLVM_LIBC_FUNCTION(double, exp10, (double x)) {
using FPBits = typename fputil::FPBits<double>;
using FloatProp = typename fputil::FloatProperties<double>;
diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp
index 3e9f9c6855c4..44aeb14e231b 100644
--- a/libc/src/math/generic/exp2.cpp
+++ b/libc/src/math/generic/exp2.cpp
@@ -42,6 +42,8 @@ constexpr double ERR_D = 0x1.8p-63;
// Errors when using double-double precision.
constexpr double ERR_DD = 0x1.0p-100;
+namespace {
+
// Polynomial approximations with double precision. Generated by Sollya with:
// > P = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-13 - 2^-30, 2^-13 + 2^-30]);
// > P;
@@ -243,6 +245,8 @@ double set_exceptional(double x) {
return x + static_cast<double>(FPBits::inf());
}
+} // namespace
+
LLVM_LIBC_FUNCTION(double, exp2, (double x)) {
using FPBits = typename fputil::FPBits<double>;
using FloatProp = typename fputil::FloatProperties<double>;
diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp
index e7cc24083975..deb3b0adc0ea 100644
--- a/libc/src/math/generic/expm1.cpp
+++ b/libc/src/math/generic/expm1.cpp
@@ -61,6 +61,8 @@ constexpr double MLOG_2_EXP2_M12_MID = 0x1.718432a1b0e26p-47;
constexpr double MLOG_2_EXP2_M12_MID_30 = 0x1.718432ap-47;
constexpr double MLOG_2_EXP2_M12_LO = 0x1.b0e2633fe0685p-79;
+namespace {
+
// Polynomial approximations with double precision:
// Return expm1(dx) / x ~ 1 + dx / 2 + dx^2 / 6 + dx^3 / 24.
// For |dx| < 2^-13 + 2^-30:
@@ -269,6 +271,8 @@ double set_exceptional(double x) {
return x + static_cast<double>(FPBits::inf());
}
+} // namespace
+
LLVM_LIBC_FUNCTION(double, expm1, (double x)) {
using FPBits = typename fputil::FPBits<double>;
using FloatProp = typename fputil::FloatProperties<double>;
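
The anonymous namespaces added to exp.cpp, exp2.cpp, exp10.cpp and expm1.cpp give the polynomial helpers and set_exceptional internal linkage, so the identically named helpers in these translation units cannot collide at link time. A minimal sketch of the pattern, with a toy polynomial matching the expm1(dx)/x ~ 1 + dx/2 + dx^2/6 + dx^3/24 comment above:

    // Several translation units can each define `poly_approx` without an
    // ODR or link-time conflict as long as each definition has internal linkage:
    namespace {
    double poly_approx(double dx) {
      // Degree-3 sketch of expm1(dx)/x ~ 1 + dx/2 + dx^2/6 + dx^3/24.
      return 1.0 + dx * (0.5 + dx * (1.0 / 6.0 + dx * (1.0 / 24.0)));
    }
    } // namespace

    int main() { return poly_approx(0.0) == 1.0 ? 0 : 1; }
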
diff --git a/libc/src/math/gpu/vendor/amdgpu/platform.h b/libc/src/math/gpu/vendor/amdgpu/platform.h
index d06a53d7371d..160a8508cd8b 100644
--- a/libc/src/math/gpu/vendor/amdgpu/platform.h
+++ b/libc/src/math/gpu/vendor/amdgpu/platform.h
@@ -72,6 +72,10 @@ extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9010;
extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9012;
#elif defined(__gfx940__)
extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9400;
+#elif defined(__gfx941__)
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9401;
+#elif defined(__gfx942__)
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9402;
#elif defined(__gfx1010__)
extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 10100;
#elif defined(__gfx1011__)
diff --git a/libc/src/stdio/generic/puts.cpp b/libc/src/stdio/generic/puts.cpp
index d8d69332566c..0f19ec526767 100644
--- a/libc/src/stdio/generic/puts.cpp
+++ b/libc/src/stdio/generic/puts.cpp
@@ -15,9 +15,26 @@
namespace LIBC_NAMESPACE {
+namespace {
+
+// Simple RAII helper that locks the stream and unlocks it once destroyed.
+struct ScopedLock {
+ ScopedLock(LIBC_NAMESPACE::File *stream) : stream(stream) { stream->lock(); }
+ ~ScopedLock() { stream->unlock(); }
+
+private:
+ LIBC_NAMESPACE::File *stream;
+};
+
+} // namespace
+
LLVM_LIBC_FUNCTION(int, puts, (const char *__restrict str)) {
cpp::string_view str_view(str);
- auto result = LIBC_NAMESPACE::stdout->write(str, str_view.size());
+
+ // We need to lock the stream to ensure the newline is always appended.
+ ScopedLock lock(LIBC_NAMESPACE::stdout);
+
+ auto result = LIBC_NAMESPACE::stdout->write_unlocked(str, str_view.size());
if (result.has_error())
libc_errno = result.error;
size_t written = result.value;
@@ -25,7 +42,7 @@ LLVM_LIBC_FUNCTION(int, puts, (const char *__restrict str)) {
// The stream should be in an error state in this case.
return EOF;
}
- result = LIBC_NAMESPACE::stdout->write("\n", 1);
+ result = LIBC_NAMESPACE::stdout->write_unlocked("\n", 1);
if (result.has_error())
libc_errno = result.error;
written = result.value;
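
The ScopedLock added to puts is plain RAII: lock in the constructor, unlock in the destructor, so the string and the trailing newline are written under one critical section on every exit path. A minimal generic sketch of the same idea using std::mutex (the libc File locking API itself is not reproduced here):

    #include <mutex>

    struct Stream {
      std::mutex m;
      void lock() { m.lock(); }
      void unlock() { m.unlock(); }
      // write_unlocked(...) would assume the caller already holds the lock.
    };

    class ScopedLock {
    public:
      explicit ScopedLock(Stream &s) : stream(s) { stream.lock(); }
      ~ScopedLock() { stream.unlock(); }  // runs on every exit path
      ScopedLock(const ScopedLock &) = delete;
      ScopedLock &operator=(const ScopedLock &) = delete;

    private:
      Stream &stream;
    };

    int main() {
      Stream out;
      {
        ScopedLock guard(out);  // both "writes" below happen under one lock
        // out.write_unlocked("hello", 5);
        // out.write_unlocked("\n", 1);
      }
      return 0;
    }
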
diff --git a/libc/src/stdio/printf_core/char_converter.h b/libc/src/stdio/printf_core/char_converter.h
index 9b1501ff24b0..13596b8ed4f2 100644
--- a/libc/src/stdio/printf_core/char_converter.h
+++ b/libc/src/stdio/printf_core/char_converter.h
@@ -9,8 +9,6 @@
#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CHAR_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CHAR_CONVERTER_H
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/common.h"
#include "src/stdio/printf_core/converter_utils.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/writer.h"
@@ -21,10 +19,10 @@ namespace printf_core {
LIBC_INLINE int convert_char(Writer *writer, const FormatSection &to_conv) {
char c = static_cast<char>(to_conv.conv_val_raw);
- constexpr int string_len = 1;
+ constexpr int STRING_LEN = 1;
size_t padding_spaces =
- to_conv.min_width > string_len ? to_conv.min_width - string_len : 0;
+ to_conv.min_width > STRING_LEN ? to_conv.min_width - STRING_LEN : 0;
// If the padding is on the left side, write the spaces first.
if (padding_spaces > 0 &&
diff --git a/libc/src/stdio/printf_core/converter_utils.h b/libc/src/stdio/printf_core/converter_utils.h
index 4540bba6346e..54f0a870d0ac 100644
--- a/libc/src/stdio/printf_core/converter_utils.h
+++ b/libc/src/stdio/printf_core/converter_utils.h
@@ -10,7 +10,6 @@
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CONVERTER_UTILS_H
#include "src/__support/CPP/limits.h"
-#include "src/__support/common.h"
#include "src/stdio/printf_core/core_structs.h"
#include <inttypes.h>
diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h
index 37538362fa3e..7634d45568ab 100644
--- a/libc/src/stdio/printf_core/core_structs.h
+++ b/libc/src/stdio/printf_core/core_structs.h
@@ -53,7 +53,7 @@ struct FormatSection {
// This operator is only used for testing and should be automatically
// optimized out for release builds.
- bool operator==(const FormatSection &other) const {
+ LIBC_INLINE bool operator==(const FormatSection &other) const {
if (has_conv != other.has_conv)
return false;
@@ -93,11 +93,11 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() {
if constexpr (cpp::is_same_v<T, void>) {
return TypeDesc{0, PrimaryType::Unknown};
} else {
- constexpr bool isPointer = cpp::is_pointer_v<T>;
- constexpr bool isFloat = cpp::is_floating_point_v<T>;
- return TypeDesc{sizeof(T), isPointer ? PrimaryType::Pointer
- : isFloat ? PrimaryType::Float
- : PrimaryType::Integer};
+ constexpr bool IS_POINTER = cpp::is_pointer_v<T>;
+ constexpr bool IS_FLOAT = cpp::is_floating_point_v<T>;
+ return TypeDesc{sizeof(T), IS_POINTER ? PrimaryType::Pointer
+ : IS_FLOAT ? PrimaryType::Float
+ : PrimaryType::Integer};
}
}
diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h
index 458f494d5edf..78ce7af3a060 100644
--- a/libc/src/stdio/printf_core/float_dec_converter.h
+++ b/libc/src/stdio/printf_core/float_dec_converter.h
@@ -10,13 +10,8 @@
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_DEC_CONVERTER_H
#include "src/__support/CPP/string_view.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/FloatProperties.h"
#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/UInt.h"
-#include "src/__support/UInt128.h"
-#include "src/__support/common.h"
#include "src/__support/float_to_string.h"
#include "src/__support/integer_to_string.h"
#include "src/__support/libc_assert.h"
@@ -103,14 +98,14 @@ class PaddingWriter {
size_t min_width = 0;
public:
- PaddingWriter() {}
- PaddingWriter(const FormatSection &to_conv, char init_sign_char)
+ LIBC_INLINE PaddingWriter() {}
+ LIBC_INLINE PaddingWriter(const FormatSection &to_conv, char init_sign_char)
: left_justified((to_conv.flags & FormatFlags::LEFT_JUSTIFIED) > 0),
leading_zeroes((to_conv.flags & FormatFlags::LEADING_ZEROES) > 0),
sign_char(init_sign_char),
min_width(to_conv.min_width > 0 ? to_conv.min_width : 0) {}
- int write_left_padding(Writer *writer, size_t total_digits) {
+ LIBC_INLINE int write_left_padding(Writer *writer, size_t total_digits) {
// The pattern is (spaces) (sign) (zeroes), but only one of spaces and
// zeroes can be written, and only if the padding amount is positive.
int padding_amount =
@@ -133,7 +128,7 @@ public:
return 0;
}
- int write_right_padding(Writer *writer, size_t total_digits) {
+ LIBC_INLINE int write_right_padding(Writer *writer, size_t total_digits) {
// If and only if the conversion is left justified, there may be trailing
// spaces.
int padding_amount =
@@ -170,7 +165,7 @@ class FloatWriter {
Writer *writer; // Writes to the final output.
PaddingWriter padding_writer; // Handles prefixes/padding, uses total_digits.
- int flush_buffer(bool round_up_max_blocks = false) {
+ LIBC_INLINE int flush_buffer(bool round_up_max_blocks = false) {
const char MAX_BLOCK_DIGIT = (round_up_max_blocks ? '0' : '9');
// Write the most recent buffered block, and mark has_written
@@ -249,17 +244,18 @@ class FloatWriter {
(sizeof(int) * 8));
public:
- FloatWriter(Writer *init_writer, bool init_has_decimal_point,
- const PaddingWriter &init_padding_writer)
+ LIBC_INLINE FloatWriter(Writer *init_writer, bool init_has_decimal_point,
+ const PaddingWriter &init_padding_writer)
: has_decimal_point(init_has_decimal_point), writer(init_writer),
padding_writer(init_padding_writer) {}
- void init(size_t init_total_digits, size_t init_digits_before_decimal) {
+ LIBC_INLINE void init(size_t init_total_digits,
+ size_t init_digits_before_decimal) {
total_digits = init_total_digits;
digits_before_decimal = init_digits_before_decimal;
}
- void write_first_block(BlockInt block, bool exp_format = false) {
+ LIBC_INLINE void write_first_block(BlockInt block, bool exp_format = false) {
const DecimalString buf(block);
const cpp::string_view int_to_str = buf.view();
size_t digits_buffered = int_to_str.size();
@@ -280,7 +276,7 @@ public:
}
}
- int write_middle_block(BlockInt block) {
+ LIBC_INLINE int write_middle_block(BlockInt block) {
if (block == MAX_BLOCK) { // Buffer max blocks in case of rounding
++max_block_count;
} else { // If a non-max block has been found
@@ -301,9 +297,9 @@ public:
return 0;
}
- int write_last_block(BlockInt block, size_t block_digits,
- RoundDirection round, int exponent = 0,
- char exp_char = '\0') {
+ LIBC_INLINE int write_last_block(BlockInt block, size_t block_digits,
+ RoundDirection round, int exponent = 0,
+ char exp_char = '\0') {
bool has_exp = (exp_char != '\0');
char end_buff[BLOCK_SIZE];
@@ -458,13 +454,13 @@ public:
return WRITE_OK;
}
- int write_zeroes(uint32_t num_zeroes) {
+ LIBC_INLINE int write_zeroes(uint32_t num_zeroes) {
RET_IF_RESULT_NEGATIVE(flush_buffer());
RET_IF_RESULT_NEGATIVE(writer->write('0', num_zeroes));
return 0;
}
- int right_pad() {
+ LIBC_INLINE int right_pad() {
return padding_writer.write_right_padding(writer, total_digits);
}
};
diff --git a/libc/src/stdio/printf_core/float_hex_converter.h b/libc/src/stdio/printf_core/float_hex_converter.h
index a3a8c0420bef..5ccae81b430c 100644
--- a/libc/src/stdio/printf_core/float_hex_converter.h
+++ b/libc/src/stdio/printf_core/float_hex_converter.h
@@ -10,10 +10,8 @@
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_HEX_CONVERTER_H
#include "src/__support/CPP/string_view.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
#include "src/stdio/printf_core/converter_utils.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/float_inf_nan_converter.h"
diff --git a/libc/src/stdio/printf_core/float_inf_nan_converter.h b/libc/src/stdio/printf_core/float_inf_nan_converter.h
index a0310dc88b56..8669dc374cb2 100644
--- a/libc/src/stdio/printf_core/float_inf_nan_converter.h
+++ b/libc/src/stdio/printf_core/float_inf_nan_converter.h
@@ -10,7 +10,6 @@
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_INF_NAN_CONVERTER_H
#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/common.h"
#include "src/stdio/printf_core/converter_utils.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/writer.h"
diff --git a/libc/src/stdio/printf_core/int_converter.h b/libc/src/stdio/printf_core/int_converter.h
index 13fcf3f1aa2e..7744d801cbc1 100644
--- a/libc/src/stdio/printf_core/int_converter.h
+++ b/libc/src/stdio/printf_core/int_converter.h
@@ -11,7 +11,6 @@
#include "src/__support/CPP/span.h"
#include "src/__support/CPP/string_view.h"
-#include "src/__support/common.h"
#include "src/__support/integer_to_string.h"
#include "src/stdio/printf_core/converter_utils.h"
#include "src/stdio/printf_core/core_structs.h"
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index f1994517e1ab..ab491655275f 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -10,9 +10,6 @@
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H
#include "src/__support/CPP/optional.h"
-#include "src/__support/CPP/type_traits.h"
-#include "src/__support/arg_list.h"
-#include "src/__support/common.h"
#include "src/__support/str_to_integer.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/printf_config.h"
diff --git a/libc/src/stdio/printf_core/ptr_converter.h b/libc/src/stdio/printf_core/ptr_converter.h
index 73c6e608a59a..c5d4086647ec 100644
--- a/libc/src/stdio/printf_core/ptr_converter.h
+++ b/libc/src/stdio/printf_core/ptr_converter.h
@@ -9,9 +9,6 @@
#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PTR_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PTR_CONVERTER_H
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/common.h"
-#include "src/stdio/printf_core/converter_utils.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/int_converter.h"
#include "src/stdio/printf_core/string_converter.h"
diff --git a/libc/src/stdio/printf_core/string_converter.h b/libc/src/stdio/printf_core/string_converter.h
index 158315311e9e..04dc5a06da22 100644
--- a/libc/src/stdio/printf_core/string_converter.h
+++ b/libc/src/stdio/printf_core/string_converter.h
@@ -10,7 +10,6 @@
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_STRING_CONVERTER_H
#include "src/__support/CPP/string_view.h"
-#include "src/__support/common.h"
#include "src/stdio/printf_core/converter_utils.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/writer.h"
diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h
index 35cafacd5a8c..0310905f36f1 100644
--- a/libc/src/stdio/printf_core/write_int_converter.h
+++ b/libc/src/stdio/printf_core/write_int_converter.h
@@ -9,7 +9,6 @@
#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_WRITE_INT_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_WRITE_INT_CONVERTER_H
-#include "src/__support/CPP/limits.h"
#include "src/stdio/printf_core/core_structs.h"
#include "src/stdio/printf_core/writer.h"
diff --git a/libc/src/stdio/printf_core/writer.cpp b/libc/src/stdio/printf_core/writer.cpp
index c831ca14c9d9..f8ecd829af3a 100644
--- a/libc/src/stdio/printf_core/writer.cpp
+++ b/libc/src/stdio/printf_core/writer.cpp
@@ -8,9 +8,7 @@
#include "writer.h"
#include "src/__support/CPP/string_view.h"
-#include "src/__support/macros/optimization.h"
#include "src/stdio/printf_core/core_structs.h"
-#include "src/string/memory_utils/inline_memcpy.h"
#include "src/string/memory_utils/inline_memset.h"
#include <stddef.h>
diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h
index e4f503abc34c..67513eca9728 100644
--- a/libc/src/stdio/printf_core/writer.h
+++ b/libc/src/stdio/printf_core/writer.h
@@ -45,7 +45,7 @@ struct WriteBuffer {
// write as much of new_str to the buffer as it can. The current position in
// the buffer will be reset iff stream_writer is called. Calling this with an
// empty string will flush the buffer if relevant.
- int overflow_write(cpp::string_view new_str) {
+ LIBC_INLINE int overflow_write(cpp::string_view new_str) {
// If there is a stream_writer, write the contents of the buffer, then
// new_str, then clear the buffer.
if (stream_writer != nullptr) {
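
The overflow_write comment above describes a buffer that either forwards to a stream callback (flushing and resetting its position) or, when there is no callback, fills itself with as much of the new data as fits. A toy model of that strategy with made-up names (ToyBuffer and this overflow_write are illustrative, not the libc WriteBuffer API):

    #include <cstddef>
    #include <cstring>
    #include <iostream>
    #include <string_view>

    // Toy buffer: flush to a sink if one exists, otherwise buffer what fits.
    struct ToyBuffer {
      char buff[8];
      std::size_t pos = 0;
      void (*stream_writer)(std::string_view) = nullptr; // optional sink

      int overflow_write(std::string_view new_str) {
        if (stream_writer != nullptr) {
          stream_writer(std::string_view(buff, pos)); // flush buffered bytes
          stream_writer(new_str);                     // then the new data
          pos = 0;                                    // position reset only on this path
          return 0;
        }
        std::size_t space = sizeof(buff) - pos;       // no sink: copy what fits
        std::size_t n = new_str.size() < space ? new_str.size() : space;
        std::memcpy(buff + pos, new_str.data(), n);
        pos += n;
        return n == new_str.size() ? 0 : -1;          // -1: data was truncated
      }
    };

    int main() {
      ToyBuffer b;
      b.overflow_write("hi ");                        // buffered, no sink yet
      b.stream_writer = [](std::string_view s) { std::cout << s; };
      b.overflow_write("there\n");                    // flushes "hi " then "there\n"
    }
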
diff --git a/libc/src/stdio/scanf_core/converter_utils.h b/libc/src/stdio/scanf_core/converter_utils.h
index a14f35796d27..a25e8a73e99a 100644
--- a/libc/src/stdio/scanf_core/converter_utils.h
+++ b/libc/src/stdio/scanf_core/converter_utils.h
@@ -9,11 +9,9 @@
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_CONVERTER_UTILS_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_CONVERTER_UTILS_H
-#include "src/__support/common.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/str_to_float.h"
#include "src/stdio/scanf_core/core_structs.h"
-#include "src/stdio/scanf_core/reader.h"
#include <stddef.h>
diff --git a/libc/src/stdio/scanf_core/core_structs.h b/libc/src/stdio/scanf_core/core_structs.h
index 246f770e0cab..29e1bf2e47f3 100644
--- a/libc/src/stdio/scanf_core/core_structs.h
+++ b/libc/src/stdio/scanf_core/core_structs.h
@@ -11,7 +11,6 @@
#include "src/__support/CPP/bitset.h"
#include "src/__support/CPP/string_view.h"
-#include "src/__support/FPUtil/FPBits.h"
#include <inttypes.h>
#include <stddef.h>
@@ -46,7 +45,7 @@ struct FormatSection {
cpp::bitset<256> scan_set;
- bool operator==(const FormatSection &other) {
+ LIBC_INLINE bool operator==(const FormatSection &other) {
if (has_conv != other.has_conv)
return false;
diff --git a/libc/src/stdio/scanf_core/current_pos_converter.h b/libc/src/stdio/scanf_core/current_pos_converter.h
index fd62383e3940..be25cefed151 100644
--- a/libc/src/stdio/scanf_core/current_pos_converter.h
+++ b/libc/src/stdio/scanf_core/current_pos_converter.h
@@ -9,7 +9,6 @@
#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_CURRENT_POS_CONVERTER_H
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_CURRENT_POS_CONVERTER_H
-#include "src/__support/common.h"
#include "src/stdio/scanf_core/converter_utils.h"
#include "src/stdio/scanf_core/core_structs.h"
#include "src/stdio/scanf_core/reader.h"
diff --git a/libc/src/stdio/scanf_core/parser.h b/libc/src/stdio/scanf_core/parser.h
index 7f3a53be3570..5ae9009bc4a2 100644
--- a/libc/src/stdio/scanf_core/parser.h
+++ b/libc/src/stdio/scanf_core/parser.h
@@ -10,7 +10,6 @@
#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
#include "src/__support/arg_list.h"
-#include "src/__support/common.h"
#include "src/__support/ctype_utils.h"
#include "src/__support/str_to_integer.h"
#include "src/stdio/scanf_core/core_structs.h"
diff --git a/libc/src/stdio/scanf_core/reader.h b/libc/src/stdio/scanf_core/reader.h
index d8647fe2c4ec..f750c4341a8d 100644
--- a/libc/src/stdio/scanf_core/reader.h
+++ b/libc/src/stdio/scanf_core/reader.h
@@ -38,10 +38,11 @@ class Reader {
public:
// TODO: Set buff_len with a proper constant
- Reader(ReadBuffer *string_buffer) : rb(string_buffer) {}
+ LIBC_INLINE Reader(ReadBuffer *string_buffer) : rb(string_buffer) {}
- Reader(void *stream, StreamGetc stream_getc_in, StreamUngetc stream_ungetc_in,
- ReadBuffer *stream_buffer = nullptr)
+ LIBC_INLINE Reader(void *stream, StreamGetc stream_getc_in,
+ StreamUngetc stream_ungetc_in,
+ ReadBuffer *stream_buffer = nullptr)
: rb(stream_buffer), input_stream(stream), stream_getc(stream_getc_in),
stream_ungetc(stream_ungetc_in) {}
@@ -63,7 +64,7 @@ public:
// this is a file reader, else c is ignored.
void ungetc(char c);
- size_t chars_read() { return cur_chars_read; }
+ LIBC_INLINE size_t chars_read() { return cur_chars_read; }
};
} // namespace scanf_core
diff --git a/libc/src/time/gpu/time_utils.h b/libc/src/time/gpu/time_utils.h
index 53548181e17e..ffab6438c711 100644
--- a/libc/src/time/gpu/time_utils.h
+++ b/libc/src/time/gpu/time_utils.h
@@ -17,7 +17,8 @@ namespace LIBC_NAMESPACE {
// AMDGPU does not have a single set frequency. Different architectures and
// cards can have varying values. Here we default to a few known values, but for
// complete support the frequency needs to be read from the kernel driver.
-#if defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__) || \
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || \
+ defined(__gfx1010__) || defined(__gfx1011__) || defined(__gfx1012__) || \
defined(__gfx1013__) || defined(__gfx1030__) || defined(__gfx1031__) || \
defined(__gfx1032__) || defined(__gfx1033__) || defined(__gfx1034__) || \
defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1100__) || \
@@ -27,7 +28,7 @@ namespace LIBC_NAMESPACE {
constexpr uint64_t clock_freq = 100000000;
#elif defined(__gfx900__) || defined(__gfx902__) || defined(__gfx904__) || \
defined(__gfx906__) || defined(__gfx908__) || defined(__gfx909__) || \
- defined(__gfx90a__) || defined(__gfx90c__) || defined(__gfx940__)
+ defined(__gfx90a__) || defined(__gfx90c__)
// These architectures use a 25 MHz fixed frequency clock except for Vega 10,
// which is actually 27 MHz. We default to 25 MHz in all cases anyway.
constexpr uint64_t clock_freq = 25000000;
@@ -36,8 +37,9 @@ constexpr uint64_t clock_freq = 25000000;
constexpr uint64_t clock_freq = 0;
#endif
-// We provide an externally visible symbol such that the runtime can set this to
-// the correct value. If it is not set we try to default to the known values.
+// We provide an externally visible symbol such that the runtime can set
+// this to the correct value. If it is not set we try to default to the
+// known values.
extern "C" [[gnu::visibility("protected")]] uint64_t
[[clang::address_space(4)]] __llvm_libc_clock_freq;
#define GPU_CLOCKS_PER_SEC static_cast<clock_t>(__llvm_libc_clock_freq)
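
The protected __llvm_libc_clock_freq symbol exists so the runtime can patch in the real tick rate; GPU_CLOCKS_PER_SEC then feeds the usual ticks/frequency conversion. A host-side sketch of that arithmetic, with clock_freq and read_ticks as plain stand-ins (the real symbol lives in address space 4 and the tick source is a hardware counter):

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the runtime-patched frequency; 100 MHz matches the default
    // chosen for the gfx10/gfx94x branch above.
    static std::uint64_t clock_freq = 100000000;

    // Hypothetical tick counter; on the GPU this would read a hardware register.
    static std::uint64_t read_ticks() { return 250000000; }

    int main() {
      std::uint64_t ticks = read_ticks();
      double seconds = static_cast<double>(ticks) / static_cast<double>(clock_freq);
      std::printf("%.2f s\n", seconds); // 2.50 s at 100 MHz
    }
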
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index fe8b3c4c84c3..a92e6da56096 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -42,6 +42,8 @@ add_libc_test(
DEPENDS
libc.src.__support.high_precision_decimal
libc.src.__support.uint128
+ # FIXME Test segfaults on gfx90a GPU
+ UNIT_TEST_ONLY
)
add_libc_test(
diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h
index 1e7343f865b7..ba6d46293cd0 100644
--- a/libc/test/src/__support/str_to_fp_test.h
+++ b/libc/test/src/__support/str_to_fp_test.h
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "src/__support/FPUtil/FloatProperties.h"
+#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/UInt128.h"
#include "src/__support/str_to_float.h"
#include "src/errno/libc_errno.h"
diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp
index 971bac55bd9d..0ad72c35645c 100644
--- a/libc/test/src/__support/uint_test.cpp
+++ b/libc/test/src/__support/uint_test.cpp
@@ -10,19 +10,62 @@
#include "src/__support/UInt.h"
#include "test/UnitTest/Test.h"
+#include <math.h> // HUGE_VAL
-// We want to test LIBC_NAMESPACE::cpp::UInt<128> explicitly. So, for
+namespace LIBC_NAMESPACE {
+
+using LL_UInt64 = cpp::UInt<64>;
+// We want to test cpp::UInt<128> explicitly. So, for
// convenience, we use a sugar which does not conflict with the UInt128 type
// which can resolve to __uint128_t if the platform has it.
-using LL_UInt128 = LIBC_NAMESPACE::cpp::UInt<128>;
-using LL_UInt192 = LIBC_NAMESPACE::cpp::UInt<192>;
-using LL_UInt256 = LIBC_NAMESPACE::cpp::UInt<256>;
-using LL_UInt320 = LIBC_NAMESPACE::cpp::UInt<320>;
-using LL_UInt512 = LIBC_NAMESPACE::cpp::UInt<512>;
-using LL_UInt1024 = LIBC_NAMESPACE::cpp::UInt<1024>;
+using LL_UInt128 = cpp::UInt<128>;
+using LL_UInt192 = cpp::UInt<192>;
+using LL_UInt256 = cpp::UInt<256>;
+using LL_UInt320 = cpp::UInt<320>;
+using LL_UInt512 = cpp::UInt<512>;
+using LL_UInt1024 = cpp::UInt<1024>;
+
+using LL_Int128 = cpp::Int<128>;
+using LL_Int192 = cpp::Int<192>;
+
+TEST(LlvmLibcUIntClassTest, BitCastToFromDouble) {
+ static_assert(cpp::is_trivially_copyable<LL_UInt64>::value);
+ static_assert(sizeof(LL_UInt64) == sizeof(double));
+ const double inf = HUGE_VAL;
+ const double max = DBL_MAX;
+ const double array[] = {0.0, 0.1, 1.0, max, inf};
+ for (double value : array) {
+ LL_UInt64 back = cpp::bit_cast<LL_UInt64>(value);
+ double forth = cpp::bit_cast<double>(back);
+ EXPECT_TRUE(value == forth);
+ }
+}
-using LL_Int128 = LIBC_NAMESPACE::cpp::Int<128>;
-using LL_Int192 = LIBC_NAMESPACE::cpp::Int<192>;
+#ifdef __SIZEOF_INT128__
+TEST(LlvmLibcUIntClassTest, BitCastToFromNativeUint128) {
+ static_assert(cpp::is_trivially_copyable<LL_UInt128>::value);
+ static_assert(sizeof(LL_UInt128) == sizeof(__uint128_t));
+ const __uint128_t array[] = {0, 1, ~__uint128_t(0)};
+ for (__uint128_t value : array) {
+ LL_UInt128 back = cpp::bit_cast<LL_UInt128>(value);
+ __uint128_t forth = cpp::bit_cast<__uint128_t>(back);
+ EXPECT_TRUE(value == forth);
+ }
+}
+#endif
+
+#ifdef LIBC_COMPILER_HAS_FLOAT128
+TEST(LlvmLibcUIntClassTest, BitCastToFromNativeFloat128) {
+ static_assert(cpp::is_trivially_copyable<LL_UInt128>::value);
+ static_assert(sizeof(LL_UInt128) == sizeof(float128));
+ const float128 array[] = {0, 0.1, 1};
+ for (float128 value : array) {
+ LL_UInt128 back = cpp::bit_cast<LL_UInt128>(value);
+ float128 forth = cpp::bit_cast<float128>(back);
+ EXPECT_TRUE(value == forth);
+ }
+}
+#endif
TEST(LlvmLibcUIntClassTest, BasicInit) {
LL_UInt128 half_val(12345);
@@ -634,3 +677,5 @@ TEST(LlvmLibcUIntClassTest, ConstructorFromUInt128Tests) {
}
#endif // __SIZEOF_INT128__
+
+} // namespace LIBC_NAMESPACE
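
The new tests above assert that cpp::bit_cast round-trips exactly between UInt<N> and a same-sized native type. The same property can be sketched with the standard std::bit_cast (C++20) on uint64_t/double, which is what the double test exercises:

    #include <bit>
    #include <cassert>
    #include <cfloat>
    #include <cmath>
    #include <cstdint>

    int main() {
      static_assert(sizeof(std::uint64_t) == sizeof(double));
      const double values[] = {0.0, 0.1, 1.0, DBL_MAX, HUGE_VAL};
      for (double v : values) {
        std::uint64_t bits = std::bit_cast<std::uint64_t>(v); // copy the bytes out
        double back = std::bit_cast<double>(bits);            // and copy them back
        assert(back == v); // exact round trip, including infinity
      }
    }
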
diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt
index 416307a20d7d..d9fa0e31df0e 100644
--- a/libc/utils/MPFRWrapper/CMakeLists.txt
+++ b/libc/utils/MPFRWrapper/CMakeLists.txt
@@ -12,7 +12,6 @@ if(LIBC_TESTS_CAN_USE_MPFR)
libc.src.__support.CPP.string_view
libc.src.__support.CPP.type_traits
libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.float_properties
libc.src.__support.FPUtil.fpbits_str
LibcTest.unit
)
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 6b9400ea453f..2a079eeb3a99 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -11,7 +11,6 @@
#include "src/__support/CPP/string.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/FloatProperties.h"
#include "src/__support/FPUtil/fpbits_str.h"
#include "test/UnitTest/FPMatcher.h"
diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst
index 79608c631f1e..f5cda9aaa5dc 100644
--- a/libcxx/docs/ReleaseNotes/18.rst
+++ b/libcxx/docs/ReleaseNotes/18.rst
@@ -57,6 +57,7 @@ Implemented Papers
- P2871R3 - Remove Deprecated Unicode Conversion Facets from C++26
- P2870R3 - Remove basic_string::reserve()
- P2909R4 - Fix formatting of code units as integers (Dude, where’s my ``char``?)
+- P0521R0 - Proposed Resolution for CA 14 (shared_ptr use_count/unique)
Improvements and New Features
@@ -83,6 +84,9 @@ Improvements and New Features
- The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRING_RESERVE`` macro has been added to make
the function ``std::basic_string<...>::reserve()`` available.
+- The ``_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE`` macro has been added to make
+ the function ``std::shared_ptr<...>::unique()`` available.
+
Deprecations and Removals
-------------------------
@@ -166,6 +170,18 @@ ABI Affecting Changes
to throw a different exception when attempting allocations that are too large
(``std::bad_alloc`` vs ``std::length_error``).
+- The layout of some views inside ``std::ranges`` that use the ``movable-box`` exposition-only type as an implementation
+ detail has changed in order to fix a bug which could result in overwriting user data following the ``movable-box``
+ (see `#70506 <https://github.com/llvm/llvm-project/issues/70506>`_).
+ This was caused by incorrect usage of the ``[[no_unique_address]]`` attribute inside the implementation of ``movable-box``.
+ This only affects the layout of the following views: ``take_while_view``, ``filter_view``, ``single_view``, ``drop_while_view``,
+ ``repeat_view``, ``transform_view``, ``chunk_by_view``. In order to avoid silent breakage, an ABI tag has been added to
+ these views such that their mangled name will be different starting in this version of libc++.
+ As a result, attempting to call a function that expects one of these views will fail to link until the code has been rebuilt
+ against a matching version of libc++. In practice, we believe it is unusual for these views to appear at ABI boundaries so this
+ should not be a major problem for most users. However, it is probably worth auditing ranges-heavy code for ABI boundaries that
+ would contain these views, or for types that contain these views as members and which are passed across ABI boundaries.
+
Build System Changes
--------------------
diff --git a/libcxx/docs/Status/Cxx17Papers.csv b/libcxx/docs/Status/Cxx17Papers.csv
index 9010402effc5..8952391afc83 100644
--- a/libcxx/docs/Status/Cxx17Papers.csv
+++ b/libcxx/docs/Status/Cxx17Papers.csv
@@ -87,7 +87,7 @@
"`P0513R0 <https://wg21.link/P0513R0>`__","LWG","Poisoning the Hash","Issaquah","|Complete|","5.0"
"`P0516R0 <https://wg21.link/P0516R0>`__","LWG","Clarify That shared_future's Copy Operations have Wide Contracts","Issaquah","|Complete|","4.0"
"`P0517R0 <https://wg21.link/P0517R0>`__","LWG","Make future_error Constructible","Issaquah","|Complete|","4.0"
-"`P0521R0 <https://wg21.link/P0521R0>`__","LWG","Proposed Resolution for CA 14 (shared_ptr use_count/unique)","Issaquah","|Nothing To Do|","n/a"
+"`P0521R0 <https://wg21.link/P0521R0>`__","LWG","Proposed Resolution for CA 14 (shared_ptr use_count/unique)","Issaquah","|Complete|","18.0"
"","","","","",""
"`P0156R2 <https://wg21.link/P0156R2>`__","LWG","Variadic Lock guard(rev 5)","Kona","|Complete|","5.0"
"`P0270R3 <https://wg21.link/P0270R3>`__","CWG","Removing C dependencies from signal handler wording","Kona","",""
diff --git a/libcxx/docs/Status/Cxx20.rst b/libcxx/docs/Status/Cxx20.rst
index 227b3197d82e..2deb82547d63 100644
--- a/libcxx/docs/Status/Cxx20.rst
+++ b/libcxx/docs/Status/Cxx20.rst
@@ -44,7 +44,7 @@ Paper Status
.. [#note-P0645] P0645: The paper is implemented but still marked as an incomplete feature
(the feature-test macro is not set).
.. [#note-P0966] P0966: It was previously erroneously marked as complete in version 8.0. See `bug 45368 <https://llvm.org/PR45368>`__.
- .. [#note-P0619] P0619: Only sections D.8, D.9, D.10 and D.13 are implemented. Sections D.4, D.7, D.11, D.12, and D.14 remain undone.
+ .. [#note-P0619] P0619: Only sections D.8, D.9, D.10 and D.13 are implemented. Sections D.4, D.7, D.11, and D.12 remain undone.
.. [#note-P0883.1] P0883: shared_ptr and floating-point changes weren't applied as they themselves aren't implemented yet.
.. [#note-P0883.2] P0883: ``ATOMIC_FLAG_INIT`` was marked deprecated in version 14.0, but was undeprecated with the implementation of LWG3659 in version 15.0.
.. [#note-P2231] P2231: Optional is complete. The changes to variant haven't been implemented yet.
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 8d9f795da977..e1bbf39b9634 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -296,6 +296,10 @@ C++17 Specific Configuration Macros
C++20 Specific Configuration Macros
-----------------------------------
+**_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE**
+ This macro is used to re-enable the function
+ ``std::shared_ptr<...>::unique()``.
+
**_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES**:
This macro is used to re-enable all the features removed in C++20. The effect
is equivalent to manually defining each macro listed below.
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 746d5812fba0..0fe3ab44d246 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -914,10 +914,8 @@ set(files
expected
experimental/__config
experimental/__memory
- experimental/__simd/abi_tag.h
experimental/__simd/aligned_tag.h
experimental/__simd/declaration.h
- experimental/__simd/internal_declaration.h
experimental/__simd/reference.h
experimental/__simd/scalar.h
experimental/__simd/simd.h
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 754e597130c5..7d7631b6e98a 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -21,8 +21,11 @@
#include <__fwd/bit_reference.h>
#include <__iterator/segmented_iterator.h>
#include <__string/constexpr_c_functions.h>
+#include <__type_traits/is_integral.h>
#include <__type_traits/is_same.h>
+#include <__type_traits/is_signed.h>
#include <__utility/move.h>
+#include <limits>
#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
# include <cwchar>
@@ -76,6 +79,22 @@ __find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
}
#endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
+// TODO: This should also be possible to get right with different signedness.
+// Cast integral types to allow vectorization.
+template <class _Tp,
+ class _Up,
+ class _Proj,
+ __enable_if_t<__is_identity<_Proj>::value && !__libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
+ is_integral<_Tp>::value && is_integral<_Up>::value &&
+ is_signed<_Tp>::value == is_signed<_Up>::value,
+ int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
+__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) {
+ if (__value < numeric_limits<_Tp>::min() || __value > numeric_limits<_Tp>::max())
+ return __last;
+ return std::__find_impl(__first, __last, _Tp(__value), __proj);
+}
+
// __bit_iterator implementation
template <bool _ToFind, class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, _IsConst>
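
At the user level, the overload added above means std::find over a narrow integral range with a wider value of the same signedness keeps the fast path: a value that cannot be represented in the element type can never compare equal, so the search may return last immediately. A sketch (whether libc++ actually dispatches into this overload for a given iterator is an implementation detail):

    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main() {
      std::vector<short> v{1, 2, 3, 30000};

      // 30000 is representable as short, so a normal (vectorizable) scan runs.
      auto hit = std::find(v.begin(), v.end(), 30000);
      std::cout << (hit != v.end()) << '\n'; // 1

      // 100000 is not representable as short: no element can ever match,
      // which is why returning end() without scanning is valid.
      auto miss = std::find(v.begin(), v.end(), 100000);
      std::cout << (miss != v.end()) << '\n'; // 0
    }
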
diff --git a/libcxx/include/__algorithm/ranges_max.h b/libcxx/include/__algorithm/ranges_max.h
index 782ce2670f05..0f89cb2ff5bf 100644
--- a/libcxx/include/__algorithm/ranges_max.h
+++ b/libcxx/include/__algorithm/ranges_max.h
@@ -54,7 +54,8 @@ struct __fn {
indirect_strict_weak_order<projected<const _Tp*, _Proj>> _Comp = ranges::less>
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr _Tp
operator()(initializer_list<_Tp> __il, _Comp __comp = {}, _Proj __proj = {}) const {
- _LIBCPP_ASSERT_UNCATEGORIZED(__il.begin() != __il.end(), "initializer_list must contain at least one element");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+ __il.begin() != __il.end(), "initializer_list must contain at least one element");
auto __comp_lhs_rhs_swapped = [&](auto&& __lhs, auto&& __rhs) -> bool { return std::invoke(__comp, __rhs, __lhs); };
return *ranges::__min_element_impl(__il.begin(), __il.end(), __comp_lhs_rhs_swapped, __proj);
@@ -69,7 +70,7 @@ struct __fn {
auto __first = ranges::begin(__r);
auto __last = ranges::end(__r);
- _LIBCPP_ASSERT_UNCATEGORIZED(__first != __last, "range must contain at least one element");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range must contain at least one element");
if constexpr (forward_range<_Rp> && !__is_cheap_to_copy<range_value_t<_Rp>>) {
auto __comp_lhs_rhs_swapped = [&](auto&& __lhs, auto&& __rhs) -> bool {
diff --git a/libcxx/include/__algorithm/ranges_min.h b/libcxx/include/__algorithm/ranges_min.h
index be15b4536734..8757358cdf37 100644
--- a/libcxx/include/__algorithm/ranges_min.h
+++ b/libcxx/include/__algorithm/ranges_min.h
@@ -53,7 +53,8 @@ struct __fn {
indirect_strict_weak_order<projected<const _Tp*, _Proj>> _Comp = ranges::less>
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr _Tp
operator()(initializer_list<_Tp> __il, _Comp __comp = {}, _Proj __proj = {}) const {
- _LIBCPP_ASSERT_UNCATEGORIZED(__il.begin() != __il.end(), "initializer_list must contain at least one element");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+ __il.begin() != __il.end(), "initializer_list must contain at least one element");
return *ranges::__min_element_impl(__il.begin(), __il.end(), __comp, __proj);
}
@@ -65,7 +66,7 @@ struct __fn {
operator()(_Rp&& __r, _Comp __comp = {}, _Proj __proj = {}) const {
auto __first = ranges::begin(__r);
auto __last = ranges::end(__r);
- _LIBCPP_ASSERT_UNCATEGORIZED(__first != __last, "range must contain at least one element");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range must contain at least one element");
if constexpr (forward_range<_Rp> && !__is_cheap_to_copy<range_value_t<_Rp>>) {
return *ranges::__min_element_impl(__first, __last, __comp, __proj);
} else {
diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h
index a5b5cf9bd0ab..22a62b620c93 100644
--- a/libcxx/include/__algorithm/ranges_minmax.h
+++ b/libcxx/include/__algorithm/ranges_minmax.h
@@ -65,7 +65,8 @@ struct __fn {
indirect_strict_weak_order<projected<const _Type*, _Proj>> _Comp = ranges::less>
_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr ranges::minmax_result<_Type>
operator()(initializer_list<_Type> __il, _Comp __comp = {}, _Proj __proj = {}) const {
- _LIBCPP_ASSERT_UNCATEGORIZED(__il.begin() != __il.end(), "initializer_list has to contain at least one element");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+ __il.begin() != __il.end(), "initializer_list has to contain at least one element");
auto __iters = std::__minmax_element_impl(__il.begin(), __il.end(), __comp, __proj);
return ranges::minmax_result<_Type>{*__iters.first, *__iters.second};
}
@@ -80,7 +81,7 @@ struct __fn {
auto __last = ranges::end(__r);
using _ValueT = range_value_t<_Range>;
- _LIBCPP_ASSERT_UNCATEGORIZED(__first != __last, "range has to contain at least one element");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range has to contain at least one element");
if constexpr (forward_range<_Range>) {
// Special-case the one element case. Avoid repeatedly initializing objects from the result of an iterator
diff --git a/libcxx/include/__algorithm/sample.h b/libcxx/include/__algorithm/sample.h
index cc29dd686f6b..ebe5180b7eec 100644
--- a/libcxx/include/__algorithm/sample.h
+++ b/libcxx/include/__algorithm/sample.h
@@ -89,7 +89,7 @@ _LIBCPP_HIDE_FROM_ABI _SampleIterator __sample(
_SampleIterator __output_iter,
_Distance __n,
_UniformRandomNumberGenerator& __g) {
- _LIBCPP_ASSERT_UNCATEGORIZED(__n >= 0, "N must be a positive number.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n >= 0, "N must be a positive number.");
using _PopIterCategory = typename _IterOps<_AlgPolicy>::template __iterator_category<_PopulationIterator>;
using _Difference = typename _IterOps<_AlgPolicy>::template __difference_type<_PopulationIterator>;
diff --git a/libcxx/include/__config b/libcxx/include/__config
index adff13e714cb..40e6da8bc03a 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -200,6 +200,16 @@
# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION
# endif
+// We had some bugs where we used [[no_unique_address]] together with construct_at,
+// which caused UB because the call to construct_at could write to overlapping subobjects.
+//
+// https://github.com/llvm/llvm-project/issues/70506
+// https://github.com/llvm/llvm-project/issues/70494
+//
+// To fix the bug we had to change the ABI of some classes to remove [[no_unique_address]] under certain conditions.
+// The macro below is used for all classes whose ABI has changed as part of fixing these bugs.
+# define _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG __attribute__((__abi_tag__("subobj_fix_2023")))
+
// Changes the iterator type of select containers (see below) to a bounded iterator that keeps track of whether it's
// within the bounds of the original container and asserts it on every dereference.
//
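
The macro expands to an abi_tag attribute, which changes the mangled name of every declaration involving the tagged class, so a mismatch between the old and new layout becomes a link error instead of silent corruption. A minimal sketch of the mechanism with a made-up tag (GCC/Clang extension; the mangled spellings in the comments assume the Itanium C++ ABI):

    // Inspect with: clang++ -c abi_tag_demo.cpp && nm abi_tag_demo.o
    struct [[gnu::abi_tag("demo_fix_2023")]] Tagged {};
    struct Untagged {};

    // The tag appears in the mangled parameter type, e.g.
    //   _Z5takes6TaggedB13demo_fix_2023   vs.   _Z5takes8Untagged
    void takes(Tagged) {}
    void takes(Untagged) {}

    int main() {
      takes(Tagged{});
      takes(Untagged{});
    }
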
diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h
index 89854f67f5fc..31e06425703a 100644
--- a/libcxx/include/__format/formatter_output.h
+++ b/libcxx/include/__format/formatter_output.h
@@ -246,7 +246,7 @@ __write(_Iterator __first,
output_iterator<const iter_value_t<_Iterator>&> auto __out_it,
__format_spec::__parsed_specifications<_ParserCharT> __specs,
ptrdiff_t __size) -> decltype(__out_it) {
- _LIBCPP_ASSERT_UNCATEGORIZED(__first <= __last, "Not a valid range");
+ _LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "Not a valid range");
return __formatter::__write(basic_string_view{__first, __last}, std::move(__out_it), __specs, __size);
}
@@ -259,7 +259,7 @@ __write(_Iterator __first,
_Iterator __last,
output_iterator<const iter_value_t<_Iterator>&> auto __out_it,
__format_spec::__parsed_specifications<_ParserCharT> __specs) -> decltype(__out_it) {
- _LIBCPP_ASSERT_UNCATEGORIZED(__first <= __last, "Not a valid range");
+ _LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "Not a valid range");
return __formatter::__write(__first, __last, std::move(__out_it), __specs, __last - __first);
}
@@ -273,7 +273,7 @@ _LIBCPP_HIDE_FROM_ABI auto __write_transformed(
output_iterator<const _CharT&> auto __out_it,
__format_spec::__parsed_specifications<_ParserCharT> __specs,
_UnaryOperation __op) -> decltype(__out_it) {
- _LIBCPP_ASSERT_UNCATEGORIZED(__first <= __last, "Not a valid range");
+ _LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "Not a valid range");
ptrdiff_t __size = __last - __first;
if (__size >= __specs.__width_)
diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h
index 9a91179fdfb5..e38729db965c 100644
--- a/libcxx/include/__format/parser_std_format_spec.h
+++ b/libcxx/include/__format/parser_std_format_spec.h
@@ -591,7 +591,7 @@ private:
|| (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2)
# endif
_LIBCPP_HIDE_FROM_ABI constexpr bool __parse_fill_align(_Iterator& __begin, _Iterator __end, bool __use_range_fill) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__begin != __end,
"when called with an empty input the function will cause "
"undefined behavior by evaluating data not in the input");
@@ -624,7 +624,7 @@ private:
template <contiguous_iterator _Iterator>
requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4)
_LIBCPP_HIDE_FROM_ABI constexpr bool __parse_fill_align(_Iterator& __begin, _Iterator __end, bool __use_range_fill) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__begin != __end,
"when called with an empty input the function will cause "
"undefined behavior by evaluating data not in the input");
@@ -652,7 +652,7 @@ private:
// range-fill and tuple-fill are identical
template <contiguous_iterator _Iterator>
_LIBCPP_HIDE_FROM_ABI constexpr bool __parse_fill_align(_Iterator& __begin, _Iterator __end, bool __use_range_fill) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__begin != __end,
"when called with an empty input the function will cause "
"undefined behavior by evaluating data not in the input");
diff --git a/libcxx/include/__iterator/common_iterator.h b/libcxx/include/__iterator/common_iterator.h
index cc49d62cd04d..7b3f4610d531 100644
--- a/libcxx/include/__iterator/common_iterator.h
+++ b/libcxx/include/__iterator/common_iterator.h
@@ -77,7 +77,7 @@ public:
requires convertible_to<const _I2&, _Iter> && convertible_to<const _S2&, _Sent>
_LIBCPP_HIDE_FROM_ABI constexpr common_iterator(const common_iterator<_I2, _S2>& __other)
: __hold_([&]() -> variant<_Iter, _Sent> {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__other.__hold_.valueless_by_exception(), "Attempted to construct from a valueless common_iterator");
if (__other.__hold_.index() == 0)
return variant<_Iter, _Sent>{in_place_index<0>, std::__unchecked_get<0>(__other.__hold_)};
@@ -88,7 +88,7 @@ public:
requires convertible_to<const _I2&, _Iter> && convertible_to<const _S2&, _Sent> &&
assignable_from<_Iter&, const _I2&> && assignable_from<_Sent&, const _S2&>
_LIBCPP_HIDE_FROM_ABI common_iterator& operator=(const common_iterator<_I2, _S2>& __other) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__other.__hold_.valueless_by_exception(), "Attempted to assign from a valueless common_iterator");
auto __idx = __hold_.index();
@@ -110,7 +110,7 @@ public:
}
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_Iter>(__hold_), "Attempted to dereference a non-dereferenceable common_iterator");
return *std::__unchecked_get<_Iter>(__hold_);
}
@@ -118,7 +118,7 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() const
requires __dereferenceable<const _Iter>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_Iter>(__hold_), "Attempted to dereference a non-dereferenceable common_iterator");
return *std::__unchecked_get<_Iter>(__hold_);
}
@@ -129,7 +129,7 @@ public:
__i.operator->();
} || is_reference_v<iter_reference_t<_I2>> || constructible_from<iter_value_t<_I2>, iter_reference_t<_I2>>)
{
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_Iter>(__hold_), "Attempted to dereference a non-dereferenceable common_iterator");
if constexpr (is_pointer_v<_Iter> || requires(const _Iter& __i) { __i.operator->(); }) {
return std::__unchecked_get<_Iter>(__hold_);
@@ -142,14 +142,14 @@ public:
}
_LIBCPP_HIDE_FROM_ABI common_iterator& operator++() {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_Iter>(__hold_), "Attempted to increment a non-dereferenceable common_iterator");
++std::__unchecked_get<_Iter>(__hold_);
return *this;
}
_LIBCPP_HIDE_FROM_ABI decltype(auto) operator++(int) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_Iter>(__hold_), "Attempted to increment a non-dereferenceable common_iterator");
if constexpr (forward_iterator<_Iter>) {
auto __tmp = *this;
@@ -170,9 +170,9 @@ public:
requires sentinel_for<_Sent, _I2>
_LIBCPP_HIDE_FROM_ABI friend constexpr bool
operator==(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__x.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator");
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__y.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator");
auto __x_index = __x.__hold_.index();
@@ -191,9 +191,9 @@ public:
requires sentinel_for<_Sent, _I2> && equality_comparable_with<_Iter, _I2>
_LIBCPP_HIDE_FROM_ABI friend constexpr bool
operator==(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__x.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator");
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__y.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator");
auto __x_index = __x.__hold_.index();
@@ -215,9 +215,9 @@ public:
requires sized_sentinel_for<_Sent, _I2>
_LIBCPP_HIDE_FROM_ABI friend constexpr iter_difference_t<_I2>
operator-(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__x.__hold_.valueless_by_exception(), "Attempted to subtract from a valueless common_iterator");
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!__y.__hold_.valueless_by_exception(), "Attempted to subtract a valueless common_iterator");
auto __x_index = __x.__hold_.index();
@@ -239,7 +239,7 @@ public:
iter_move(const common_iterator& __i) noexcept(noexcept(ranges::iter_move(std::declval<const _Iter&>())))
requires input_iterator<_Iter>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_Iter>(__i.__hold_), "Attempted to iter_move a non-dereferenceable common_iterator");
return ranges::iter_move(std::__unchecked_get<_Iter>(__i.__hold_));
}
@@ -248,9 +248,9 @@ public:
_LIBCPP_HIDE_FROM_ABI friend constexpr void
iter_swap(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) noexcept(
noexcept(ranges::iter_swap(std::declval<const _Iter&>(), std::declval<const _I2&>()))) {
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_Iter>(__x.__hold_), "Attempted to iter_swap a non-dereferenceable common_iterator");
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
std::holds_alternative<_I2>(__y.__hold_), "Attempted to iter_swap a non-dereferenceable common_iterator");
return ranges::iter_swap(std::__unchecked_get<_Iter>(__x.__hold_), std::__unchecked_get<_I2>(__y.__hold_));
}
diff --git a/libcxx/include/__iterator/counted_iterator.h b/libcxx/include/__iterator/counted_iterator.h
index c72ac677ff2f..008c52fa87ce 100644
--- a/libcxx/include/__iterator/counted_iterator.h
+++ b/libcxx/include/__iterator/counted_iterator.h
@@ -105,14 +105,14 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr iter_difference_t<_Iter> count() const noexcept { return __count_; }
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() {
- _LIBCPP_ASSERT_UNCATEGORIZED(__count_ > 0, "Iterator is equal to or past end.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count_ > 0, "Iterator is equal to or past end.");
return *__current_;
}
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() const
requires __dereferenceable<const _Iter>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(__count_ > 0, "Iterator is equal to or past end.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count_ > 0, "Iterator is equal to or past end.");
return *__current_;
}
@@ -229,7 +229,7 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator[](iter_difference_t<_Iter> __n) const
requires random_access_iterator<_Iter>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(__n < __count_, "Subscript argument must be less than size.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < __count_, "Subscript argument must be less than size.");
return __current_[__n];
}
@@ -253,7 +253,7 @@ public:
iter_move(const counted_iterator& __i) noexcept(noexcept(ranges::iter_move(__i.__current_)))
requires input_iterator<_Iter>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(__i.__count_ > 0, "Iterator must not be past end of range.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__i.__count_ > 0, "Iterator must not be past end of range.");
return ranges::iter_move(__i.__current_);
}
@@ -261,7 +261,8 @@ public:
_LIBCPP_HIDE_FROM_ABI friend constexpr void
iter_swap(const counted_iterator& __x,
const counted_iterator<_I2>& __y) noexcept(noexcept(ranges::iter_swap(__x.__current_, __y.__current_))) {
- _LIBCPP_ASSERT_UNCATEGORIZED(__x.__count_ > 0 && __y.__count_ > 0, "Iterators must not be past end of range.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+ __x.__count_ > 0 && __y.__count_ > 0, "Iterators must not be past end of range.");
return ranges::iter_swap(__x.__current_, __y.__current_);
}
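
The assertions recategorized above guard the counted_iterator precondition that the remaining count stays positive across dereference and increment. Correct usage simply stops at std::default_sentinel before the count reaches zero, as in this sketch:

    #include <iostream>
    #include <iterator>
    #include <vector>

    int main() {
      std::vector<int> v{1, 2, 3, 4, 5};
      std::counted_iterator it(v.begin(), 3); // iterate over the first three elements
      for (; it != std::default_sentinel; ++it)
        std::cout << *it << ' ';              // dereference only while count() > 0
      std::cout << '\n';                      // prints: 1 2 3
    }
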
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index a868093026c5..9aa938b22031 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -723,7 +723,9 @@ public:
_LIBCPP_HIDE_FROM_ABI long use_count() const _NOEXCEPT { return __cntrl_ ? __cntrl_->use_count() : 0; }
- _LIBCPP_HIDE_FROM_ABI bool unique() const _NOEXCEPT { return use_count() == 1; }
+#if _LIBCPP_STD_VER < 20 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE)
+ _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_HIDE_FROM_ABI bool unique() const _NOEXCEPT { return use_count() == 1; }
+#endif
_LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return get() != nullptr; }
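
With shared_ptr::unique() hidden in C++20 builds unless _LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE is defined, use_count() == 1 is the drop-in replacement (with the usual caveat that the count is only a snapshot when other threads share ownership). A sketch:

    #include <cassert>
    #include <memory>

    int main() {
      auto p = std::make_shared<int>(42);
      assert(p.use_count() == 1); // portable replacement for the removed p.unique()

      auto q = p;                 // second owner
      assert(p.use_count() == 2); // no longer the sole owner

      q.reset();
      assert(p.use_count() == 1);
    }
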
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
index 7bf5e3c5e4e6..db473eaa50a6 100644
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -132,10 +132,6 @@ public:
private:
__compressed_pair<pointer, deleter_type> __ptr_;
- struct __nat {
- int __for_bool_;
- };
-
typedef _LIBCPP_NODEBUG __unique_ptr_deleter_sfinae<_Dp> _DeleterSFINAE;
template <bool _Dummy>
diff --git a/libcxx/include/__ranges/chunk_by_view.h b/libcxx/include/__ranges/chunk_by_view.h
index e46998687646..3ecc018cac9d 100644
--- a/libcxx/include/__ranges/chunk_by_view.h
+++ b/libcxx/include/__ranges/chunk_by_view.h
@@ -54,7 +54,8 @@ namespace ranges {
template <forward_range _View, indirect_binary_predicate<iterator_t<_View>, iterator_t<_View>> _Pred>
requires view<_View> && is_object_v<_Pred>
-class chunk_by_view : public view_interface<chunk_by_view<_View, _Pred>> {
+class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG chunk_by_view
+ : public view_interface<chunk_by_view<_View, _Pred>> {
_LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
_LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_;
diff --git a/libcxx/include/__ranges/drop_view.h b/libcxx/include/__ranges/drop_view.h
index 2b89c6877a78..83bb598b0a0c 100644
--- a/libcxx/include/__ranges/drop_view.h
+++ b/libcxx/include/__ranges/drop_view.h
@@ -90,6 +90,10 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr auto begin()
requires(!(__simple_view<_View> && random_access_range<const _View> && sized_range<const _View>))
{
+ if constexpr (random_access_range<_View> && sized_range<_View>) {
+ const auto __dist = std::min(ranges::distance(__base_), __count_);
+ return ranges::begin(__base_) + __dist;
+ }
if constexpr (_UseCache)
if (__cached_begin_.__has_value())
return *__cached_begin_;
@@ -103,7 +107,8 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr auto begin() const
requires random_access_range<const _View> && sized_range<const _View>
{
- return ranges::next(ranges::begin(__base_), __count_, ranges::end(__base_));
+ const auto __dist = std::min(ranges::distance(__base_), __count_);
+ return ranges::begin(__base_) + __dist;
}
_LIBCPP_HIDE_FROM_ABI constexpr auto end()
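
Clamping the offset to min(distance, count) is what makes views::drop with an oversized count yield an empty range rather than an iterator past the end. A user-level sketch:

    #include <iostream>
    #include <ranges>
    #include <vector>

    int main() {
      std::vector<int> v{1, 2, 3};

      auto tail = v | std::views::drop(2);  // {3}
      auto none = v | std::views::drop(10); // count > size: empty, not past-the-end

      std::cout << std::ranges::distance(tail) << '\n'; // 1
      std::cout << std::ranges::distance(none) << '\n'; // 0
    }
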
diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h
index 677b5bc66d44..eb3783eb42f1 100644
--- a/libcxx/include/__ranges/drop_while_view.h
+++ b/libcxx/include/__ranges/drop_while_view.h
@@ -45,7 +45,8 @@ namespace ranges {
template <view _View, class _Pred>
requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate<const _Pred, iterator_t<_View>>
-class drop_while_view : public view_interface<drop_while_view<_View, _Pred>> {
+class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG drop_while_view
+ : public view_interface<drop_while_view<_View, _Pred>> {
public:
_LIBCPP_HIDE_FROM_ABI drop_while_view()
requires default_initializable<_View> && default_initializable<_Pred>
diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h
index 08d50ab01104..868ad128e894 100644
--- a/libcxx/include/__ranges/filter_view.h
+++ b/libcxx/include/__ranges/filter_view.h
@@ -51,7 +51,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
namespace ranges {
template <input_range _View, indirect_unary_predicate<iterator_t<_View>> _Pred>
requires view<_View> && is_object_v<_Pred>
-class filter_view : public view_interface<filter_view<_View, _Pred>> {
+class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG filter_view : public view_interface<filter_view<_View, _Pred>> {
_LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
_LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_;
diff --git a/libcxx/include/__ranges/movable_box.h b/libcxx/include/__ranges/movable_box.h
index 6615533d3743..9b38877494ea 100644
--- a/libcxx/include/__ranges/movable_box.h
+++ b/libcxx/include/__ranges/movable_box.h
@@ -134,6 +134,20 @@ concept __doesnt_need_empty_state =
// 2. Otherwise, movable-box<T> should store only a T if either T models movable or
// is_nothrow_move_constructible_v<T> is true.
: movable<_Tp> || is_nothrow_move_constructible_v<_Tp>);
+
+// When _Tp doesn't have an assignment operator, we must implement __movable_box's assignment operator
+// by doing destroy_at followed by construct_at. However, that implementation strategy leads to UB if the nested
+// _Tp is potentially overlapping, as it is doing a non-transparent replacement of the sub-object, which means that
+// we're not considered "nested" inside the movable-box anymore, and since we're not nested within it, [basic.life]/1.5
+// says that we essentially just reused the storage of the movable-box for a completely unrelated object and ended the
+// movable-box's lifetime.
+// https://github.com/llvm/llvm-project/issues/70494#issuecomment-1845646490
+//
+// Hence, when the _Tp doesn't have an assignment operator, we can't risk making it a potentially-overlapping
+// subobject because of the above, and we don't use [[no_unique_address]] in that case.
+template <class _Tp>
+concept __can_use_no_unique_address = (copy_constructible<_Tp> ? copyable<_Tp> : movable<_Tp>);
+
# else
template <class _Tp>
@@ -144,23 +158,45 @@ concept __doesnt_need_empty_state_for_move = movable<_Tp> || is_nothrow_move_con
template <class _Tp>
concept __doesnt_need_empty_state = __doesnt_need_empty_state_for_copy<_Tp> && __doesnt_need_empty_state_for_move<_Tp>;
+
+template <class _Tp>
+concept __can_use_no_unique_address = copyable<_Tp>;
# endif
+template <class _Tp>
+struct __movable_box_holder {
+ _Tp __val_;
+
+ template <class... _Args>
+ _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box_holder(in_place_t, _Args&&... __args)
+ : __val_(std::forward<_Args>(__args)...) {}
+};
+
+template <class _Tp>
+ requires __can_use_no_unique_address<_Tp>
+struct __movable_box_holder<_Tp> {
+ _LIBCPP_NO_UNIQUE_ADDRESS _Tp __val_;
+
+ template <class... _Args>
+ _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box_holder(in_place_t, _Args&&... __args)
+ : __val_(std::forward<_Args>(__args)...) {}
+};
+
template <__movable_box_object _Tp>
requires __doesnt_need_empty_state<_Tp>
class __movable_box<_Tp> {
- _LIBCPP_NO_UNIQUE_ADDRESS _Tp __val_;
+ _LIBCPP_NO_UNIQUE_ADDRESS __movable_box_holder<_Tp> __holder_;
public:
template <class... _Args>
requires is_constructible_v<_Tp, _Args...>
- _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box(in_place_t, _Args&&... __args) noexcept(
+ _LIBCPP_HIDE_FROM_ABI constexpr explicit __movable_box(in_place_t __inplace, _Args&&... __args) noexcept(
is_nothrow_constructible_v<_Tp, _Args...>)
- : __val_(std::forward<_Args>(__args)...) {}
+ : __holder_(__inplace, std::forward<_Args>(__args)...) {}
_LIBCPP_HIDE_FROM_ABI constexpr __movable_box() noexcept(is_nothrow_default_constructible_v<_Tp>)
requires default_initializable<_Tp>
- : __val_() {}
+ : __holder_(in_place_t{}) {}
_LIBCPP_HIDE_FROM_ABI __movable_box(__movable_box const&) = default;
_LIBCPP_HIDE_FROM_ABI __movable_box(__movable_box&&) = default;
@@ -176,27 +212,29 @@ public:
// Implementation of assignment operators in case we perform optimization (2)
_LIBCPP_HIDE_FROM_ABI constexpr __movable_box& operator=(__movable_box const& __other) noexcept {
static_assert(is_nothrow_copy_constructible_v<_Tp>);
+ static_assert(!__can_use_no_unique_address<_Tp>);
if (this != std::addressof(__other)) {
- std::destroy_at(std::addressof(__val_));
- std::construct_at(std::addressof(__val_), __other.__val_);
+ std::destroy_at(std::addressof(__holder_.__val_));
+ std::construct_at(std::addressof(__holder_.__val_), __other.__holder_.__val_);
}
return *this;
}
_LIBCPP_HIDE_FROM_ABI constexpr __movable_box& operator=(__movable_box&& __other) noexcept {
static_assert(is_nothrow_move_constructible_v<_Tp>);
+ static_assert(!__can_use_no_unique_address<_Tp>);
if (this != std::addressof(__other)) {
- std::destroy_at(std::addressof(__val_));
- std::construct_at(std::addressof(__val_), std::move(__other.__val_));
+ std::destroy_at(std::addressof(__holder_.__val_));
+ std::construct_at(std::addressof(__holder_.__val_), std::move(__other.__holder_.__val_));
}
return *this;
}
- _LIBCPP_HIDE_FROM_ABI constexpr _Tp const& operator*() const noexcept { return __val_; }
- _LIBCPP_HIDE_FROM_ABI constexpr _Tp& operator*() noexcept { return __val_; }
+ _LIBCPP_HIDE_FROM_ABI constexpr _Tp const& operator*() const noexcept { return __holder_.__val_; }
+ _LIBCPP_HIDE_FROM_ABI constexpr _Tp& operator*() noexcept { return __holder_.__val_; }
- _LIBCPP_HIDE_FROM_ABI constexpr const _Tp* operator->() const noexcept { return std::addressof(__val_); }
- _LIBCPP_HIDE_FROM_ABI constexpr _Tp* operator->() noexcept { return std::addressof(__val_); }
+ _LIBCPP_HIDE_FROM_ABI constexpr const _Tp* operator->() const noexcept { return std::addressof(__holder_.__val_); }
+ _LIBCPP_HIDE_FROM_ABI constexpr _Tp* operator->() noexcept { return std::addressof(__holder_.__val_); }
_LIBCPP_HIDE_FROM_ABI constexpr bool __has_value() const noexcept { return true; }
};
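
The hazard the __movable_box_holder indirection works around is that a [[no_unique_address]] member is potentially overlapping, so a neighbouring member may live in its tail padding; re-creating the box wholesale with construct_at could then scribble over that neighbour. A sketch that only inspects the layout side of this (the structs are made up and the exact offsets are ABI-dependent):

    #include <cstddef>
    #include <iostream>

    struct Padded {
      long long a; // 8 bytes
      int b;       // 4 bytes of data, typically 4 bytes of tail padding
    };

    struct Plain {
      Padded box;
      char next;   // placed after the whole box: offset is usually 16
    };

    struct Overlapping {
      [[no_unique_address]] Padded box;
      char next;   // may be placed inside box's tail padding (offset 12 on Itanium)
    };

    int main() {
      std::cout << offsetof(Plain, next) << '\n';
      std::cout << offsetof(Overlapping, next) << '\n';
      // When next shares box's padding, destroy_at/construct_at of the whole box
      // would be allowed to overwrite next -- the bug the constraint above avoids.
    }
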
diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h
index 459a1e229613..479eca96acb0 100644
--- a/libcxx/include/__ranges/repeat_view.h
+++ b/libcxx/include/__ranges/repeat_view.h
@@ -68,7 +68,7 @@ struct __fn;
template <move_constructible _Tp, semiregular _Bound = unreachable_sentinel_t>
requires(is_object_v<_Tp> && same_as<_Tp, remove_cv_t<_Tp>> &&
(__integer_like_with_usable_difference_type<_Bound> || same_as<_Bound, unreachable_sentinel_t>))
-class repeat_view : public view_interface<repeat_view<_Tp, _Bound>> {
+class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG repeat_view : public view_interface<repeat_view<_Tp, _Bound>> {
friend struct views::__take::__fn;
friend struct views::__drop::__fn;
class __iterator;
@@ -119,7 +119,7 @@ public:
}
private:
- __movable_box<_Tp> __value_;
+ _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Tp> __value_;
_LIBCPP_NO_UNIQUE_ADDRESS _Bound __bound_ = _Bound();
};
diff --git a/libcxx/include/__ranges/single_view.h b/libcxx/include/__ranges/single_view.h
index b0b2c1d9f3c0..0ae2036a66a9 100644
--- a/libcxx/include/__ranges/single_view.h
+++ b/libcxx/include/__ranges/single_view.h
@@ -37,8 +37,8 @@ template <move_constructible _Tp>
template <copy_constructible _Tp>
# endif
requires is_object_v<_Tp>
-class single_view : public view_interface<single_view<_Tp>> {
- __movable_box<_Tp> __value_;
+class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG single_view : public view_interface<single_view<_Tp>> {
+ _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Tp> __value_;
public:
_LIBCPP_HIDE_FROM_ABI single_view()
diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h
index a7a3c1efa70e..a41978275b78 100644
--- a/libcxx/include/__ranges/subrange.h
+++ b/libcxx/include/__ranges/subrange.h
@@ -101,8 +101,8 @@ public:
requires(_Kind == subrange_kind::sized)
: __begin_(std::move(__iter)), __end_(std::move(__sent)), __size_(__n) {
if constexpr (sized_sentinel_for<_Sent, _Iter>)
- _LIBCPP_ASSERT_UNCATEGORIZED((__end_ - __begin_) == static_cast<iter_difference_t<_Iter>>(__n),
- "std::ranges::subrange was passed an invalid size hint");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS((__end_ - __begin_) == static_cast<iter_difference_t<_Iter>>(__n),
+ "std::ranges::subrange was passed an invalid size hint");
}
template <__different_from<subrange> _Range>
diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h
index a6f7f80ca76b..4534210d9794 100644
--- a/libcxx/include/__ranges/take_while_view.h
+++ b/libcxx/include/__ranges/take_while_view.h
@@ -43,7 +43,8 @@ namespace ranges {
template <view _View, class _Pred>
requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate<const _Pred, iterator_t<_View>>
-class take_while_view : public view_interface<take_while_view<_View, _Pred>> {
+class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG take_while_view
+ : public view_interface<take_while_view<_View, _Pred>> {
template <bool>
class __sentinel;
diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h
index 55c6ce587bd6..744f597ccef5 100644
--- a/libcxx/include/__ranges/transform_view.h
+++ b/libcxx/include/__ranges/transform_view.h
@@ -67,7 +67,8 @@ template <input_range _View, move_constructible _Fn>
template <input_range _View, copy_constructible _Fn>
# endif
requires __transform_view_constraints<_View, _Fn>
-class transform_view : public view_interface<transform_view<_View, _Fn>> {
+class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG transform_view
+ : public view_interface<transform_view<_View, _Fn>> {
template <bool>
class __iterator;
template <bool>
diff --git a/libcxx/include/__ranges/view_interface.h b/libcxx/include/__ranges/view_interface.h
index 3216e0bd6ff2..84dd1c316de3 100644
--- a/libcxx/include/__ranges/view_interface.h
+++ b/libcxx/include/__ranges/view_interface.h
@@ -109,7 +109,7 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) front()
requires forward_range<_D2>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!empty(), "Precondition `!empty()` not satisfied. `.front()` called on an empty view.");
return *ranges::begin(__derived());
}
@@ -118,7 +118,7 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) front() const
requires forward_range<const _D2>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
!empty(), "Precondition `!empty()` not satisfied. `.front()` called on an empty view.");
return *ranges::begin(__derived());
}
@@ -127,7 +127,8 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) back()
requires bidirectional_range<_D2> && common_range<_D2>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(!empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+ !empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view.");
return *ranges::prev(ranges::end(__derived()));
}
@@ -135,7 +136,8 @@ public:
_LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) back() const
requires bidirectional_range<const _D2> && common_range<const _D2>
{
- _LIBCPP_ASSERT_UNCATEGORIZED(!empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view.");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+ !empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view.");
return *ranges::prev(ranges::end(__derived()));
}
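
These hunks move the precondition checks from _LIBCPP_ASSERT_UNCATEGORIZED to the more specific _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS category; the same recategorization appears in subrange, the SIMD storage, and the zOS locale code elsewhere in this diff. A rough sketch of how category-specific assertion macros can be gated on a build mode; the MYLIB_* names and the mode mapping are purely illustrative assumptions, not libc++'s actual configuration:

#include <cstdio>
#include <cstdlib>

#if defined(MYLIB_HARDENING_DEBUG)
#  define MYLIB_ENABLE_VALID_ELEMENT_ACCESS 1
#  define MYLIB_ENABLE_UNCATEGORIZED 1
#elif defined(MYLIB_HARDENING_EXTENSIVE)
#  define MYLIB_ENABLE_VALID_ELEMENT_ACCESS 1
#  define MYLIB_ENABLE_UNCATEGORIZED 0
#else
#  define MYLIB_ENABLE_VALID_ELEMENT_ACCESS 0
#  define MYLIB_ENABLE_UNCATEGORIZED 0
#endif

#define MYLIB_ASSERT_IMPL(cond, msg)                                           \
  ((cond) ? (void)0                                                            \
          : (std::fprintf(stderr, "assertion failed: %s\n", msg), std::abort()))

#if MYLIB_ENABLE_VALID_ELEMENT_ACCESS
#  define MYLIB_ASSERT_VALID_ELEMENT_ACCESS(c, m) MYLIB_ASSERT_IMPL(c, m)
#else
#  define MYLIB_ASSERT_VALID_ELEMENT_ACCESS(c, m) ((void)0)
#endif

#if MYLIB_ENABLE_UNCATEGORIZED
#  define MYLIB_ASSERT_UNCATEGORIZED(c, m) MYLIB_ASSERT_IMPL(c, m)
#else
#  define MYLIB_ASSERT_UNCATEGORIZED(c, m) ((void)0)
#endif

int main() {
  int data[] = {1, 2, 3};
  int i = 2;
  // A cheap bounds check lives in a named category, so a build mode can turn it
  // on without also enabling every uncategorized check.
  MYLIB_ASSERT_VALID_ELEMENT_ACCESS(i >= 0 && i < 3, "index out of bounds");
  return data[i];
}

Giving each check a category is what allows a test to say only "UNSUPPORTED: libcpp-hardening-mode=none" instead of pinning itself to specific modes.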
diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h
index 5688e3293a69..3a8b15160107 100644
--- a/libcxx/include/__type_traits/datasizeof.h
+++ b/libcxx/include/__type_traits/datasizeof.h
@@ -28,13 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Tp>
struct __libcpp_datasizeof {
-#if __has_cpp_attribute(__no_unique_address__)
+#if __has_extension(datasizeof)
+ static const size_t value = __datasizeof(_Tp);
+#else
+// NOLINTNEXTLINE(readability-redundant-preprocessor) This is https://llvm.org/PR64825
+# if __has_cpp_attribute(__no_unique_address__)
template <class = char>
struct _FirstPaddingByte {
[[__no_unique_address__]] _Tp __v_;
char __first_padding_byte_;
};
-#else
+# else
template <bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value>
struct _FirstPaddingByte : _Tp {
char __first_padding_byte_;
@@ -45,7 +49,7 @@ struct __libcpp_datasizeof {
_Tp __v_;
char __first_padding_byte_;
};
-#endif
+# endif // __has_cpp_attribute(__no_unique_address__)
// _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow
// the use as an extension.
@@ -53,6 +57,7 @@ struct __libcpp_datasizeof {
_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof")
static const size_t value = offsetof(_FirstPaddingByte<>, __first_padding_byte_);
_LIBCPP_DIAGNOSTIC_POP
+#endif // __has_extension(datasizeof)
};
_LIBCPP_END_NAMESPACE_STD
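
The change above prefers the __datasizeof extension when the compiler provides it and keeps the offsetof-based fallback otherwise. For reference, the "data size" is sizeof minus the reusable tail padding; a standalone sketch of the fallback idea, with illustrative types and numbers that assume a typical 64-bit ABI with a 4-byte int (not a portable guarantee):

#include <cstddef>

// Four-byte int plus one char, padded to eight bytes; the user-provided
// constructor makes S non-POD, so the ABI lets derived classes reuse its
// tail padding.
struct S {
  S() : i(0), c(0) {}
  int i;
  char c;
};

// The fallback trick: derive, append one byte, and see where it lands.
// GCC and Clang accept offsetof on this non-standard-layout type as an
// extension, which is what the -Winvalid-offsetof suppression above is for.
struct FirstPaddingByte : S {
  char first_padding_byte;
};

static_assert(sizeof(S) == 8, "three bytes of tail padding after 'c'");
static_assert(offsetof(FirstPaddingByte, first_padding_byte) == 5,
              "the data size of S: sizeof(S) minus its reusable tail padding");

int main() {}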
diff --git a/libcxx/include/__utility/is_pointer_in_range.h b/libcxx/include/__utility/is_pointer_in_range.h
index e859562e7457..68cdfea6f945 100644
--- a/libcxx/include/__utility/is_pointer_in_range.h
+++ b/libcxx/include/__utility/is_pointer_in_range.h
@@ -35,7 +35,7 @@ template <class _Tp, class _Up, __enable_if_t<__is_less_than_comparable<const _T
_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool __is_pointer_in_range(
const _Tp* __begin, const _Tp* __end, const _Up* __ptr) {
if (__libcpp_is_constant_evaluated()) {
- _LIBCPP_ASSERT_UNCATEGORIZED(__builtin_constant_p(__begin <= __end), "__begin and __end do not form a range");
+ _LIBCPP_ASSERT_VALID_INPUT_RANGE(__builtin_constant_p(__begin <= __end), "__begin and __end do not form a range");
// If this is not a constant during constant evaluation we know that __ptr is not part of the allocation where
// [__begin, __end) is.
diff --git a/libcxx/include/deque b/libcxx/include/deque
index d0520b635bcc..fca8b3d6e2c7 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -998,15 +998,19 @@ private:
}
_LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT {
+ (void)__current_size;
+#ifndef _LIBCPP_HAS_NO_ASAN
if (__current_size == 0)
__annotate_from_to(0, __map_.size() * __block_size, __asan_poison, __asan_back_moved);
else {
__annotate_from_to(0, __start_, __asan_poison, __asan_front_moved);
__annotate_from_to(__start_ + __current_size, __map_.size() * __block_size, __asan_poison, __asan_back_moved);
}
+#endif
}
_LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT {
+#ifndef _LIBCPP_HAS_NO_ASAN
if (empty()) {
for (size_t __i = 0; __i < __map_.size(); ++__i) {
__annotate_whole_block(__i, __asan_unposion);
@@ -1015,30 +1019,52 @@ private:
__annotate_from_to(0, __start_, __asan_unposion, __asan_front_moved);
__annotate_from_to(__start_ + size(), __map_.size() * __block_size, __asan_unposion, __asan_back_moved);
}
+#endif
}
_LIBCPP_HIDE_FROM_ABI void __annotate_increase_front(size_type __n) const _NOEXCEPT {
+ (void)__n;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_from_to(__start_ - __n, __start_, __asan_unposion, __asan_front_moved);
+#endif
}
_LIBCPP_HIDE_FROM_ABI void __annotate_increase_back(size_type __n) const _NOEXCEPT {
+ (void)__n;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_from_to(__start_ + size(), __start_ + size() + __n, __asan_unposion, __asan_back_moved);
+#endif
}
_LIBCPP_HIDE_FROM_ABI void __annotate_shrink_front(size_type __old_size, size_type __old_start) const _NOEXCEPT {
+ (void)__old_size;
+ (void)__old_start;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_from_to(__old_start, __old_start + (__old_size - size()), __asan_poison, __asan_front_moved);
+#endif
}
_LIBCPP_HIDE_FROM_ABI void __annotate_shrink_back(size_type __old_size, size_type __old_start) const _NOEXCEPT {
+ (void)__old_size;
+ (void)__old_start;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_from_to(__old_start + size(), __old_start + __old_size, __asan_poison, __asan_back_moved);
+#endif
}
_LIBCPP_HIDE_FROM_ABI void __annotate_poison_block(const void* __beginning, const void* __end) const _NOEXCEPT {
+ (void)__beginning;
+ (void)__end;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_double_ended_contiguous_container(__beginning, __end, __beginning, __end, __end, __end);
+#endif
}
_LIBCPP_HIDE_FROM_ABI void
__annotate_whole_block(size_t __block_index, __asan_annotation_type __annotation_type) const _NOEXCEPT {
+ (void)__block_index;
+ (void)__annotation_type;
+#ifndef _LIBCPP_HAS_NO_ASAN
__map_const_iterator __block_it = __map_.begin() + __block_index;
const void* __block_start = std::__to_address(*__block_it);
const void* __block_end = std::__to_address(*__block_it + __block_size);
@@ -1049,6 +1075,7 @@ private:
__annotate_double_ended_contiguous_container(
__block_start, __block_end, __block_start, __block_start, __block_start, __block_end);
}
+#endif
}
#if !defined(_LIBCPP_HAS_NO_ASAN)
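
The deque hunks above (like the string and vector hunks later in this diff) compile the annotation bodies out entirely when ASan support is disabled, and cast each parameter to void so it still counts as used in that configuration. A minimal sketch of the pattern with hypothetical names (HAS_NO_ASAN, poison_region), not the real sanitizer interface:

#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins: HAS_NO_ASAN plays the role of the configuration macro
// and poison_region() the role of the sanitizer annotation call.  Define
// HAS_NO_ASAN to compile the annotation body away entirely.
void poison_region(const char* begin, const char* end) {
  std::printf("poisoning %zu bytes\n", static_cast<std::size_t>(end - begin));
}

struct buffer {
  char data[64] = {};
  std::size_t size = 0;

  void annotate_shrink(std::size_t old_size) const noexcept {
    (void)old_size;  // keeps -Wunused-parameter quiet when the body is compiled out
#ifndef HAS_NO_ASAN
    // Mark the bytes between the new end and the old end as off-limits again.
    poison_region(data + size, data + old_size);
#endif
  }
};

int main() {
  buffer b;
  b.size = 16;
  b.annotate_shrink(32);  // with HAS_NO_ASAN defined this is a no-op
}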
diff --git a/libcxx/include/experimental/__simd/abi_tag.h b/libcxx/include/experimental/__simd/abi_tag.h
deleted file mode 100644
index cec5be65ce5c..000000000000
--- a/libcxx/include/experimental/__simd/abi_tag.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H
-#define _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H
-
-#include <cstddef>
-#include <experimental/__config>
-#include <experimental/__simd/internal_declaration.h>
-
-#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-
-_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
-inline namespace parallelism_v2 {
-namespace simd_abi {
-
-using scalar = __scalar;
-
-// TODO: make this platform dependent
-template <int _Np>
-using fixed_size = __vec_ext<_Np>;
-
-template <class _Tp>
-inline constexpr int max_fixed_size = 32;
-
-// TODO: make this platform dependent
-template <class _Tp>
-using compatible = __vec_ext<16 / sizeof(_Tp)>;
-
-// TODO: make this platform dependent
-template <class _Tp>
-using native = __vec_ext<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>;
-
-// TODO: make this platform dependent
-template <class _Tp, size_t _Np, class... _Abis>
-struct deduce {
- using type = fixed_size<_Np>;
-};
-
-// TODO: make this platform dependent
-template <class _Tp, size_t _Np, class... _Abis>
-using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type;
-
-} // namespace simd_abi
-} // namespace parallelism_v2
-_LIBCPP_END_NAMESPACE_EXPERIMENTAL
-
-#endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-#endif // _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H
diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h
index d216a21c073f..edbb3b24931f 100644
--- a/libcxx/include/experimental/__simd/aligned_tag.h
+++ b/libcxx/include/experimental/__simd/aligned_tag.h
@@ -10,10 +10,10 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
#define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
-#include <__bit/bit_ceil.h>
#include <__memory/assume_aligned.h>
#include <cstddef>
#include <experimental/__config>
+#include <experimental/__simd/traits.h>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
@@ -30,9 +30,12 @@ struct element_aligned_tag {
}
};
+template <>
+inline constexpr bool is_simd_flag_type_v<element_aligned_tag> = true;
+
struct vector_aligned_tag {
template <class _Tp, class _Up = typename _Tp::value_type>
- static constexpr size_t __alignment = std::__bit_ceil(sizeof(_Up) * _Tp::size());
+ static constexpr size_t __alignment = memory_alignment_v<_Tp, _Up>;
template <class _Tp, class _Up>
static _LIBCPP_HIDE_FROM_ABI constexpr _Up* __apply(_Up* __ptr) {
@@ -40,6 +43,9 @@ struct vector_aligned_tag {
}
};
+template <>
+inline constexpr bool is_simd_flag_type_v<vector_aligned_tag> = true;
+
template <size_t _Np>
struct overaligned_tag {
template <class _Tp, class _Up = typename _Tp::value_type>
@@ -51,6 +57,9 @@ struct overaligned_tag {
}
};
+template <size_t _Np>
+inline constexpr bool is_simd_flag_type_v<overaligned_tag<_Np>> = true;
+
inline constexpr element_aligned_tag element_aligned{};
inline constexpr vector_aligned_tag vector_aligned{};
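
This hunk moves the is_simd_flag_type_v specializations next to the tag types they describe (the old copies in traits.h are removed further down). A small sketch of the underlying pattern with simplified, non-libc++ names: a variable-template trait that defaults to false and is specialized beside each qualifying type, so adding a new tag cannot silently miss a trait update in a far-away header:

#include <cstddef>

// Primary template: by default a type is not a flag type.
template <class T>
inline constexpr bool is_flag_type_v = false;

struct element_aligned_tag {};
template <>
inline constexpr bool is_flag_type_v<element_aligned_tag> = true;

template <std::size_t N>
struct overaligned_tag {};
template <std::size_t N>
inline constexpr bool is_flag_type_v<overaligned_tag<N>> = true;

static_assert(is_flag_type_v<element_aligned_tag>);
static_assert(is_flag_type_v<overaligned_tag<16>>);
static_assert(!is_flag_type_v<int>);

int main() {}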
diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h
index 065faeaec384..7b45d035c271 100644
--- a/libcxx/include/experimental/__simd/declaration.h
+++ b/libcxx/include/experimental/__simd/declaration.h
@@ -10,13 +10,63 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
#define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
+#include <cstddef>
#include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
inline namespace parallelism_v2 {
+namespace simd_abi {
+template <int>
+struct __vec_ext;
+struct __scalar;
+
+using scalar = __scalar;
+
+// TODO: make this platform dependent
+template <int _Np>
+using fixed_size = __vec_ext<_Np>;
+
+template <class _Tp>
+inline constexpr int max_fixed_size = 32;
+
+// TODO: make this platform dependent
+template <class _Tp>
+using compatible = __vec_ext<16 / sizeof(_Tp)>;
+
+// TODO: make this platform dependent
+template <class _Tp>
+using native = __vec_ext<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>;
+
+// TODO: make this platform dependent
+template <class _Tp, size_t _Np, class... _Abis>
+struct deduce {
+ using type = fixed_size<_Np>;
+};
+
+// TODO: make this platform dependent
+template <class _Tp, size_t _Np, class... _Abis>
+using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type;
+
+} // namespace simd_abi
+
+template <class _Tp, class _Abi>
+struct __simd_storage;
+
+template <class _Tp, class _Abi>
+struct __mask_storage;
+
+template <class _Tp, class _Abi>
+struct __simd_operations;
+
+template <class _Tp, class _Abi>
+struct __mask_operations;
+
+struct element_aligned_tag;
+struct vector_aligned_tag;
+template <size_t>
+struct overaligned_tag;
template <class _Tp, class _Abi = simd_abi::compatible<_Tp>>
class simd;
diff --git a/libcxx/include/experimental/__simd/internal_declaration.h b/libcxx/include/experimental/__simd/internal_declaration.h
deleted file mode 100644
index 9ad1ad1ae319..000000000000
--- a/libcxx/include/experimental/__simd/internal_declaration.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H
-#define _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H
-
-#include <experimental/__config>
-
-#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-
-_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
-inline namespace parallelism_v2 {
-namespace simd_abi {
-template <int>
-struct __vec_ext;
-struct __scalar;
-} // namespace simd_abi
-
-template <class _Tp, class _Abi>
-struct __simd_storage;
-
-template <class _Tp, class _Abi>
-struct __mask_storage;
-
-template <class _Tp, class _Abi>
-struct __simd_operations;
-
-template <class _Tp, class _Abi>
-struct __mask_operations;
-
-} // namespace parallelism_v2
-_LIBCPP_END_NAMESPACE_EXPERIMENTAL
-
-#endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-#endif // _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index 53fa1c29f374..5eeff4c1e82a 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -12,7 +12,7 @@
#include <cstddef>
#include <experimental/__config>
-#include <experimental/__simd/internal_declaration.h>
+#include <experimental/__simd/declaration.h>
#include <experimental/__simd/traits.h>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index ffb328eb345b..c345811fee7f 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -15,9 +15,7 @@
#include <__utility/forward.h>
#include <cstddef>
#include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
#include <experimental/__simd/declaration.h>
-#include <experimental/__simd/internal_declaration.h>
#include <experimental/__simd/reference.h>
#include <experimental/__simd/traits.h>
#include <experimental/__simd/utility.h>
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index 325b8409e3b6..db03843b46e3 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -13,9 +13,7 @@
#include <__type_traits/is_same.h>
#include <cstddef>
#include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
#include <experimental/__simd/declaration.h>
-#include <experimental/__simd/internal_declaration.h>
#include <experimental/__simd/reference.h>
#include <experimental/__simd/traits.h>
diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h
index 9b4abe9d0c23..ec25b4bfa7f9 100644
--- a/libcxx/include/experimental/__simd/traits.h
+++ b/libcxx/include/experimental/__simd/traits.h
@@ -10,14 +10,12 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H
#define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H
+#include <__bit/bit_ceil.h>
#include <__type_traits/integral_constant.h>
#include <__type_traits/is_same.h>
#include <cstddef>
#include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
-#include <experimental/__simd/aligned_tag.h>
#include <experimental/__simd/declaration.h>
-#include <experimental/__simd/internal_declaration.h>
#include <experimental/__simd/utility.h>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
@@ -47,15 +45,6 @@ struct is_simd_mask : bool_constant<is_simd_mask_v<_Tp>> {};
template <class _Tp>
inline constexpr bool is_simd_flag_type_v = false;
-template <>
-inline constexpr bool is_simd_flag_type_v<element_aligned_tag> = true;
-
-template <>
-inline constexpr bool is_simd_flag_type_v<vector_aligned_tag> = true;
-
-template <size_t _Np>
-inline constexpr bool is_simd_flag_type_v<overaligned_tag<_Np>> = true;
-
template <class _Tp>
struct is_simd_flag_type : bool_constant<is_simd_flag_type_v<_Tp>> {};
@@ -71,7 +60,7 @@ inline constexpr size_t simd_size_v = simd_size<_Tp, _Abi>::value;
template <class _Tp,
class _Up = typename _Tp::value_type,
bool = (is_simd_v<_Tp> && __is_vectorizable_v<_Up>) || (is_simd_mask_v<_Tp> && is_same_v<_Up, bool>)>
-struct memory_alignment : integral_constant<size_t, vector_aligned_tag::__alignment<_Tp, _Up>> {};
+struct memory_alignment : integral_constant<size_t, std::__bit_ceil(sizeof(_Up) * _Tp::size())> {};
template <class _Tp, class _Up>
struct memory_alignment<_Tp, _Up, false> {};
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index baaeda6a7401..07ba032f493b 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -15,7 +15,7 @@
#include <__utility/integer_sequence.h>
#include <cstddef>
#include <experimental/__config>
-#include <experimental/__simd/internal_declaration.h>
+#include <experimental/__simd/declaration.h>
#include <experimental/__simd/traits.h>
#include <experimental/__simd/utility.h>
@@ -38,11 +38,11 @@ struct __simd_storage<_Tp, simd_abi::__vec_ext<_Np>> {
_Tp __data __attribute__((__vector_size__(std::__bit_ceil((sizeof(_Tp) * _Np)))));
_LIBCPP_HIDE_FROM_ABI _Tp __get(size_t __idx) const noexcept {
- _LIBCPP_ASSERT_UNCATEGORIZED(__idx >= 0 && __idx < _Np, "Index is out of bounds");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__idx >= 0 && __idx < _Np, "Index is out of bounds");
return __data[__idx];
}
_LIBCPP_HIDE_FROM_ABI void __set(size_t __idx, _Tp __v) noexcept {
- _LIBCPP_ASSERT_UNCATEGORIZED(__idx >= 0 && __idx < _Np, "Index is out of bounds");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__idx >= 0 && __idx < _Np, "Index is out of bounds");
__data[__idx] = __v;
}
};
diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd
index 56858832857c..adca9faa47bb 100644
--- a/libcxx/include/experimental/simd
+++ b/libcxx/include/experimental/simd
@@ -78,7 +78,6 @@ inline namespace parallelism_v2 {
#endif
#include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
#include <experimental/__simd/aligned_tag.h>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/scalar.h>
diff --git a/libcxx/include/memory b/libcxx/include/memory
index 71e812064646..ee245d5fd2dc 100644
--- a/libcxx/include/memory
+++ b/libcxx/include/memory
@@ -629,7 +629,7 @@ public:
T& operator*() const noexcept;
T* operator->() const noexcept;
long use_count() const noexcept;
- bool unique() const noexcept;
+    bool unique() const noexcept; // deprecated in C++17, removed in C++20
+    bool unique() const noexcept;                              // deprecated in C++17, removed in C++20
explicit operator bool() const noexcept;
template<class U> bool owner_before(shared_ptr<U> const& b) const noexcept;
template<class U> bool owner_before(weak_ptr<U> const& b) const noexcept;
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index a37e96205cf2..d10670d4faaf 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -530,10 +530,8 @@ module std_experimental [system] {
export *
}
module simd {
- module abi_tag { private header "experimental/__simd/abi_tag.h" }
module aligned_tag { private header "experimental/__simd/aligned_tag.h" }
module declaration { private header "experimental/__simd/declaration.h" }
- module internal_declaration { private header "experimental/__simd/internal_declaration.h" }
module reference { private header "experimental/__simd/reference.h" }
module scalar { private header "experimental/__simd/scalar.h" }
module simd { private header "experimental/__simd/simd.h" }
diff --git a/libcxx/include/string b/libcxx/include/string
index fdffca5aed18..c676182fba8b 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -1903,23 +1903,34 @@ private:
}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT {
+ (void) __current_size;
+#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
__annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1);
+#endif
}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_delete() const _NOEXCEPT {
+#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
__annotate_contiguous_container(data() + size() + 1, data() + capacity() + 1);
+#endif
}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT {
+ (void) __n;
+#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
__annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n);
+#endif
}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT {
+ (void) __old_size;
+#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN)
if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long()))
__annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1);
+#endif
}
template <size_type __a>
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 3abc917f5c0e..0098273a195f 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -845,19 +845,30 @@ private:
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT {
+ (void)__current_size;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_contiguous_container(data(), data() + capacity(), data() + capacity(), data() + __current_size);
+#endif
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT {
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_contiguous_container(data(), data() + capacity(), data() + size(), data() + capacity());
+#endif
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_increase(size_type __n) const _NOEXCEPT {
+ (void)__n;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_contiguous_container(data(), data() + capacity(), data() + size(), data() + size() + __n);
+#endif
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_shrink(size_type __old_size) const _NOEXCEPT {
+ (void)__old_size;
+#ifndef _LIBCPP_HAS_NO_ASAN
__annotate_contiguous_container(data(), data() + capacity(), data() + __old_size, data() + size());
+#endif
}
struct _ConstructTransaction {
diff --git a/libcxx/modules/std/algorithm.inc b/libcxx/modules/std/algorithm.inc
index f6b35efa144f..75e8a3af78de 100644
--- a/libcxx/modules/std/algorithm.inc
+++ b/libcxx/modules/std/algorithm.inc
@@ -16,7 +16,9 @@ export namespace std {
using std::ranges::in_in_result;
using std::ranges::in_out_out_result;
using std::ranges::in_out_result;
+#if _LIBCPP_STD_VER >= 23
using std::ranges::in_value_result;
+#endif
using std::ranges::min_max_result;
// using std::ranges::out_value_result;
} // namespace ranges
@@ -40,6 +42,7 @@ export namespace std {
using std::ranges::none_of;
}
+#if _LIBCPP_STD_VER >= 23
// [alg.contains], contains
namespace ranges {
using std::ranges::contains;
@@ -47,6 +50,7 @@ export namespace std {
using std::ranges::contains_subrange;
#endif
} // namespace ranges
+#endif // _LIBCPP_STD_VER >= 23
// [alg.foreach], for each
using std::for_each;
diff --git a/libcxx/src/support/ibm/xlocale_zos.cpp b/libcxx/src/support/ibm/xlocale_zos.cpp
index 4c20997b4eb7..9a90e08e11cf 100644
--- a/libcxx/src/support/ibm/xlocale_zos.cpp
+++ b/libcxx/src/support/ibm/xlocale_zos.cpp
@@ -103,7 +103,7 @@ locale_t uselocale(locale_t newloc) {
tokenized.push_back(s);
}
- _LIBCPP_ASSERT_UNCATEGORIZED(tokenized.size() >= _NCAT, "locale-name list is too short");
+ _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(tokenized.size() >= _NCAT, "locale-name list is too short");
previous_loc->lc_collate = tokenized[LC_COLLATE];
previous_loc->lc_ctype = tokenized[LC_CTYPE];
diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp b/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp
index b23b4d4530ee..bd9dfd4549c4 100644
--- a/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp
@@ -10,7 +10,7 @@
// REQUIRES: has-unix-headers
// UNSUPPORTED: c++03, c++11, c++14, c++17
-// REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: libcpp-hardening-mode=none
// XFAIL: availability-verbose_abort-missing
#include <algorithm>
diff --git a/libcxx/test/std/containers/sequences/deque/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
index 37e87d59503e..37e87d59503e 100644
--- a/libcxx/test/std/containers/sequences/deque/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp
new file mode 100644
index 000000000000..f803b2cad75b
--- /dev/null
+++ b/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none
+// XFAIL: availability-verbose_abort-missing
+
+#include <iterator>
+
+#include "check_assertion.h"
+#include "test_iterators.h"
+
+int main(int, char**) {
+ using Iter = std::counted_iterator<int*>;
+ int a[] = {1, 2, 3};
+ Iter valid_i(a, 1);
+
+ {
+ Iter i;
+
+ TEST_LIBCPP_ASSERT_FAILURE(*i, "Iterator is equal to or past end.");
+ TEST_LIBCPP_ASSERT_FAILURE(i[999], "Subscript argument must be less than size.");
+ TEST_LIBCPP_ASSERT_FAILURE(std::ranges::iter_move(i), "Iterator must not be past end of range.");
+ TEST_LIBCPP_ASSERT_FAILURE(std::ranges::iter_swap(i, valid_i), "Iterators must not be past end of range.");
+ TEST_LIBCPP_ASSERT_FAILURE(std::ranges::iter_swap(valid_i, i), "Iterators must not be past end of range.");
+ std::ranges::iter_swap(valid_i, valid_i); // Ok
+ }
+
+ { // Check the `const` overload of `operator*`.
+ const Iter i;
+
+ TEST_LIBCPP_ASSERT_FAILURE(*i, "Iterator is equal to or past end.");
+ }
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp
new file mode 100644
index 000000000000..ea4574fc1a9c
--- /dev/null
+++ b/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none
+// XFAIL: availability-verbose_abort-missing
+
+#include <iterator>
+
+#include "check_assertion.h"
+#include "test_iterators.h"
+
+int main(int, char**) {
+ using Iter = std::common_iterator<int*, sentinel_wrapper<int*>>;
+ int a[] = {1, 2, 3};
+ sentinel_wrapper<int*> s;
+ Iter valid_i = a;
+
+ {
+ Iter i = s;
+
+ TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable common_iterator");
+
+ TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-dereferenceable common_iterator");
+ TEST_LIBCPP_ASSERT_FAILURE(i++, "Attempted to increment a non-dereferenceable common_iterator");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ std::ranges::iter_move(i), "Attempted to iter_move a non-dereferenceable common_iterator");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ std::ranges::iter_swap(i, valid_i), "Attempted to iter_swap a non-dereferenceable common_iterator");
+ TEST_LIBCPP_ASSERT_FAILURE(
+ std::ranges::iter_swap(valid_i, i), "Attempted to iter_swap a non-dereferenceable common_iterator");
+ std::ranges::iter_swap(valid_i, valid_i); // Ok
+ }
+
+ { // Check the `const` overload of `operator*`.
+ const Iter i = s;
+ TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable common_iterator");
+ }
+
+ { // Check `operator->`.
+ struct Foo {
+ int x = 0;
+ };
+
+ std::common_iterator<Foo*, sentinel_wrapper<Foo*>> i = sentinel_wrapper<Foo*>();
+ TEST_LIBCPP_ASSERT_FAILURE(i->x, "Attempted to dereference a non-dereferenceable common_iterator");
+ }
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp
new file mode 100644
index 000000000000..e696598f618b
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/no_unique_address.compile.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// XFAIL: msvc
+
+// This test ensures that we use `[[no_unique_address]]` in `chunk_by_view`.
+
+#include <ranges>
+
+struct View : std::ranges::view_base {
+ int* begin() const;
+ int* end() const;
+};
+
+struct Pred {
+ template <class... Args>
+ bool operator()(const Args&...) const;
+};
+
+template <class View>
+struct Test {
+ [[no_unique_address]] View view;
+ char c;
+};
+
+// [[no_unique_address]] applied to _View
+struct ViewWithPadding : View {
+ alignas(128) char c;
+};
+
+static_assert(sizeof(Test<std::ranges::chunk_by_view<ViewWithPadding, Pred>>) ==
+ sizeof(std::ranges::chunk_by_view<ViewWithPadding, Pred>));
+
+// [[no_unique_address]] applied to movable-box
+struct PredWithPadding : Pred {
+ alignas(128) char c;
+};
+
+static_assert(sizeof(Test<std::ranges::chunk_by_view<View, PredWithPadding>>) ==
+ sizeof(std::ranges::chunk_by_view<View, PredWithPadding>));
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp
new file mode 100644
index 000000000000..9712635a0010
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/empty_object.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// This test ensures that <movable-box> behaves correctly when it holds an empty type.
+
+#include <ranges>
+
+#include <cassert>
+#include <utility>
+
+bool copied = false;
+bool moved = false;
+
+struct Empty {
+ Empty() noexcept {}
+ Empty(Empty const&) noexcept { copied = true; }
+ Empty(Empty&&) noexcept { moved = true; }
+ Empty& operator=(Empty const&) = delete;
+ Empty& operator=(Empty&&) = delete;
+};
+
+using Box = std::ranges::__movable_box<Empty>;
+
+struct Inherit : Box {};
+
+struct Hold : Box {
+ [[no_unique_address]] Inherit member;
+};
+
+int main(int, char**) {
+ Hold box;
+
+ Box& base = static_cast<Box&>(box);
+ Box& member = static_cast<Box&>(box.member);
+
+ // Despite [[no_unique_address]], the two objects have the same type so they
+ // can't share the same address.
+ assert(&base != &member);
+
+ // Make sure that we do perform the copy-construction, which wouldn't be the
+ // case if the two <movable-box>s had the same address.
+ base = member;
+ assert(copied);
+
+ base = std::move(member);
+ assert(moved);
+
+ return 0;
+}
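
The new test above relies on the rule that two subobjects of the same type must have distinct addresses, even under [[no_unique_address]]; that is why assigning one __movable_box<Empty> to the other is observable. A tiny standalone illustration of just that rule (Holder, Empty and AlsoEmpty are illustrative names):

#include <cassert>
#include <memory>

struct Empty {};
struct AlsoEmpty {};

struct Holder {
  [[no_unique_address]] Empty a;
  [[no_unique_address]] Empty b;      // same type as 'a': must not overlap
  [[no_unique_address]] AlsoEmpty c;  // different type: the compiler may overlap it
};

int main() {
  Holder h;
  // Two objects of the same type always have distinct addresses, even when both
  // are [[no_unique_address]] members; the same guarantee keeps the base and
  // member boxes in the test above separate objects.
  assert(static_cast<void*>(std::addressof(h.a)) != static_cast<void*>(std::addressof(h.b)));
}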
diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp
index e21191f98dea..4ec6c888f9c1 100644
--- a/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp
+++ b/libcxx/test/libcxx/ranges/range.adaptors/range.move.wrap/no_unique_address.pass.cpp
@@ -8,49 +8,71 @@
// UNSUPPORTED: c++03, c++11, c++14, c++17
-// This test ensures that <copyable-box> behaves correctly when it holds an empty type.
+// clang-cl and cl currently don't support [[no_unique_address]]
+// XFAIL: msvc
#include <ranges>
#include <cassert>
#include <utility>
-bool copied = false;
-bool moved = false;
+#include "test_macros.h"
-struct Empty {
- Empty() noexcept {}
- Empty(Empty const&) noexcept { copied = true; }
- Empty(Empty&&) noexcept { moved = true; }
- Empty& operator=(Empty const&) = delete;
- Empty& operator=(Empty&&) = delete;
-};
+template <class T, bool ExpectNoUniqueAddress>
+void test_no_unique_address() {
+ struct Test {
+ [[no_unique_address]] std::ranges::__movable_box<T> box_;
+ bool b2;
+ };
-using Box = std::ranges::__movable_box<Empty>;
+ if constexpr (ExpectNoUniqueAddress) {
+ static_assert(sizeof(Test) == sizeof(bool));
+ } else {
+ static_assert(sizeof(Test) > sizeof(bool));
+ }
+}
-struct Inherit : Box {};
+struct Copyable {};
-struct Hold : Box {
- [[no_unique_address]] Inherit member;
+struct NotCopyAssignable {
+ constexpr NotCopyAssignable() = default;
+ constexpr NotCopyAssignable(const NotCopyAssignable&) = default;
+ NotCopyAssignable& operator=(const NotCopyAssignable&) = delete;
};
-int main(int, char**) {
- Hold box;
+struct NotMoveAssignable {
+ constexpr NotMoveAssignable() = default;
+ constexpr NotMoveAssignable(const NotMoveAssignable&) = default;
+ NotMoveAssignable& operator=(const NotMoveAssignable&) = default;
+ constexpr NotMoveAssignable(NotMoveAssignable&&) = default;
+ NotMoveAssignable& operator=(NotMoveAssignable&&) = delete;
+};
- Box& base = static_cast<Box&>(box);
- Box& member = static_cast<Box&>(box.member);
+struct MoveOnly {
+ constexpr MoveOnly() = default;
+ constexpr MoveOnly(const MoveOnly&) = delete;
+ MoveOnly& operator=(const MoveOnly&) = delete;
+ constexpr MoveOnly(MoveOnly&&) = default;
+ MoveOnly& operator=(MoveOnly&&) = default;
+};
- // Despite [[no_unique_address]], the two objects have the same type so they
- // can't share the same address.
- assert(&base != &member);
+struct MoveOnlyNotAssignable {
+ constexpr MoveOnlyNotAssignable() = default;
+ constexpr MoveOnlyNotAssignable(const MoveOnlyNotAssignable&) = delete;
+ MoveOnlyNotAssignable& operator=(const MoveOnlyNotAssignable&) = delete;
+ constexpr MoveOnlyNotAssignable(MoveOnlyNotAssignable&&) = default;
+ MoveOnlyNotAssignable& operator=(MoveOnlyNotAssignable&&) = delete;
+};
- // Make sure that we do perform the copy-construction, which wouldn't be the
- // case if the two <copyable-box>s had the same address.
- base = member;
- assert(copied);
+int main(int, char**) {
+ test_no_unique_address<Copyable, true>();
+ test_no_unique_address<NotCopyAssignable, false>();
+ test_no_unique_address<NotMoveAssignable, false>();
- base = std::move(member);
- assert(moved);
+#if TEST_STD_VER >= 23
+ test_no_unique_address<MoveOnly, true>();
+ test_no_unique_address<MoveOnlyNotAssignable, false>();
+#endif
return 0;
}
diff --git a/libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp
new file mode 100644
index 000000000000..6ab2ee173ca1
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.factories/range.repeat.view/no_unique_address.compile.pass.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// XFAIL: msvc
+
+// This test ensures that we use `[[no_unique_address]]` in `repeat_view`.
+
+#include <ranges>
+
+struct Empty {};
+
+struct Test {
+ [[no_unique_address]] std::ranges::repeat_view<Empty> v;
+ bool b;
+};
+
+static_assert(sizeof(Test) == sizeof(bool));
diff --git a/libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp
new file mode 100644
index 000000000000..10883b838578
--- /dev/null
+++ b/libcxx/test/libcxx/ranges/range.factories/range.single.view/no_unique_address.compile.pass.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: msvc
+
+// This test ensures that we use `[[no_unique_address]]` in `single_view`.
+
+#include <ranges>
+
+struct Empty {};
+
+struct Test {
+ [[no_unique_address]] std::ranges::single_view<Empty> v;
+ bool b;
+};
+
+static_assert(sizeof(Test) == sizeof(bool));
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp
index 965e82a7b403..3260a8cbc7f8 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp
@@ -46,11 +46,9 @@ void test() {
{
std::expected<int, int> e;
e.transform_error(return_unexpected<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
// expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected<T, E> for {{.*}} is ill-formed.}}
e.transform_error(return_no_object<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
// expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected<T, E> for {{.*}} is ill-formed.}}
}
@@ -58,27 +56,21 @@ void test() {
{
const std::expected<int, int> e;
e.transform_error(return_unexpected<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
e.transform_error(return_no_object<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
}
// Test && overload
{
std::expected<int, int> e;
std::move(e).transform_error(return_unexpected<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
std::move(e).transform_error(return_no_object<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
}
// Test const&& overload
{
const std::expected<int, int> e;
std::move(e).transform_error(return_unexpected<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
std::move(e).transform_error(return_no_object<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* 2 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
}
}
// clang-format on
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
index 09aa1332e980..21dc24768791 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp
@@ -46,11 +46,9 @@ void test() {
{
std::expected<void, int> e;
e.transform_error(return_unexpected<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
// expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected<T, E> with a E that is not a valid argument for unexpected<E> is ill-formed}}
e.transform_error(return_no_object<int&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
// expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected<T, E> with a E that is not a valid argument for unexpected<E> is ill-formed}}
}
@@ -58,27 +56,21 @@ void test() {
{
const std::expected<void, int> e;
e.transform_error(return_unexpected<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
e.transform_error(return_no_object<const int &>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
}
// Test && overload
{
std::expected<void, int> e;
std::move(e).transform_error(return_unexpected<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
std::move(e).transform_error(return_no_object<int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
}
// Test const&& overload
{
const std::expected<void, int> e;
std::move(e).transform_error(return_unexpected<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
std::move(e).transform_error(return_no_object<const int&&>); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}}
- // expected-error-re@*:* {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}}
}
}
// clang-format on
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
index 0afc573aa177..0676da13e90f 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare
// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
// MSVC warning C4389: '==': signed/unsigned mismatch
// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4389
@@ -162,6 +163,45 @@ void test_deque() {
}
}
+template <class T>
+struct TestIntegerPromotions1 {
+ template <class U>
+ TEST_CONSTEXPR_CXX20 void test(T val, U to_find) {
+ bool expect_match = val == to_find;
+ assert(std::find(&val, &val + 1, to_find) == (expect_match ? &val : &val + 1));
+ }
+
+ template <class U>
+ TEST_CONSTEXPR_CXX20 void operator()() {
+ test<U>(0, 0);
+ test<U>(0, 1);
+ test<U>(1, 1);
+ test<U>(0, -1);
+ test<U>(-1, -1);
+ test<U>(0, U(-127));
+ test<U>(T(-127), U(-127));
+ test<U>(T(-128), U(-128));
+ test<U>(T(-129), U(-129));
+ test<U>(T(255), U(255));
+ test<U>(T(256), U(256));
+ test<U>(T(257), U(257));
+ test<U>(0, std::numeric_limits<U>::min());
+ test<U>(T(std::numeric_limits<U>::min()), std::numeric_limits<U>::min());
+ test<U>(0, std::numeric_limits<U>::min() + 1);
+ test<U>(T(std::numeric_limits<U>::min() + 1), std::numeric_limits<U>::min() + 1);
+ test<U>(0, std::numeric_limits<U>::max());
+ test<U>(T(std::numeric_limits<U>::max()), std::numeric_limits<U>::max());
+ test<U>(T(std::numeric_limits<U>::max() - 1), std::numeric_limits<U>::max() - 1);
+ }
+};
+
+struct TestIntegerPromotions {
+ template <class T>
+ TEST_CONSTEXPR_CXX20 void operator()() {
+ types::for_each(types::integral_types(), TestIntegerPromotions1<T>());
+ }
+};
+
TEST_CONSTEXPR_CXX20 bool test() {
types::for_each(types::integer_types(), TestTypes<char>());
types::for_each(types::integer_types(), TestTypes<int>());
@@ -181,6 +221,8 @@ TEST_CONSTEXPR_CXX20 bool test() {
}
#endif
+ types::for_each(types::integral_types(), TestIntegerPromotions());
+
return true;
}
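
The TestIntegerPromotions coverage added above exercises std::find when the element type and the searched-for value have different integer types. A short worked example of why the promotion matters (self-contained, not part of the test suite):

#include <algorithm>
#include <cassert>

int main() {
  // std::find evaluates '*it == value' after the usual arithmetic conversions,
  // so each element is promoted before it is compared with the int value.
  signed char buf[] = {0, -1, 7};

  // -1 promotes to the int -1 we are searching for: found at index 1.
  assert(std::find(buf, buf + 3, -1) == buf + 1);

  // 255 never equals any promoted signed char (-128..127), so nothing is found,
  // even though -1 and 255 share a bit pattern in a single byte.
  assert(std::find(buf, buf + 3, 255) == buf + 3);
}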
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp
index 669f90d5292d..3a2942b74596 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.int/eval.pass.cpp
@@ -26,12 +26,6 @@
#include "test_macros.h"
-// The __int128 conversions to/from floating point crash on MinGW on x86_64.
-// This is fixed in Clang 14 by https://reviews.llvm.org/D110413.
-#if defined(__x86_64__) && defined(__MINGW32__) && defined(__clang_major__) && __clang_major__ < 14
- #define TEST_BUGGY_I128_FP
-#endif
-
template <class T>
T sqr(T x) {
return x * x;
@@ -127,7 +121,7 @@ int main(int, char**)
test_statistics<std::int8_t, std::minstd_rand0>();
test_statistics<std::uint8_t, std::minstd_rand0>();
-#if !defined(TEST_HAS_NO_INT128) && !defined(TEST_BUGGY_I128_FP)
+#if !defined(TEST_HAS_NO_INT128)
test_statistics<__int128_t, std::minstd_rand0>();
test_statistics<__uint128_t, std::minstd_rand0>();
diff --git a/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp
index cff088453d22..8c28769acf7f 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.drop/begin.pass.cpp
@@ -82,6 +82,44 @@ constexpr bool test() {
static_assert(!BeginInvocable<const ForwardView>);
{
+    // Non-common, non-simple view.
+    // The wording of the standard is:
+    // Returns: ranges::next(ranges::begin(base_), count_, ranges::end(base_))
+    // Note that "Returns" is used here, meaning that we don't have to do it this way.
+    // In fact, doing it literally would use ranges::advance, which is O(n) on a non-common range,
+    // but [range.range] requires "amortized constant time" for ranges::begin and ranges::end.
+    // Here, we test that begin() is indeed constant time, by creating a customized
+    // sentinel and counting how many times the sentinel's equality function is called.
+    // It should be 0 times, but since this test (or any test under libcxx/test/std) is
+    // also used by other implementations, we relax the condition to requiring that
+    // sentinel_cmp_calls is a constant number.
+ int sentinel_cmp_calls_1 = 0;
+ int sentinel_cmp_calls_2 = 0;
+ using NonCommonView = MaybeSimpleNonCommonView<false>;
+ static_assert(std::ranges::random_access_range<NonCommonView>);
+ static_assert(std::ranges::sized_range<NonCommonView>);
+ std::ranges::drop_view dropView9_1(NonCommonView{{}, 0, &sentinel_cmp_calls_1}, 4);
+ std::ranges::drop_view dropView9_2(NonCommonView{{}, 0, &sentinel_cmp_calls_2}, 6);
+ assert(dropView9_1.begin() == globalBuff + 4);
+ assert(dropView9_2.begin() == globalBuff + 6);
+ assert(sentinel_cmp_calls_1 == sentinel_cmp_calls_2);
+ }
+
+ {
+ // non-common simple view, same as above.
+ int sentinel_cmp_calls_1 = 0;
+ int sentinel_cmp_calls_2 = 0;
+ using NonCommonView = MaybeSimpleNonCommonView<true>;
+ static_assert(std::ranges::random_access_range<NonCommonView>);
+ static_assert(std::ranges::sized_range<NonCommonView>);
+ std::ranges::drop_view dropView10_1(NonCommonView{{}, 0, &sentinel_cmp_calls_1}, 4);
+ std::ranges::drop_view dropView10_2(NonCommonView{{}, 0, &sentinel_cmp_calls_2}, 6);
+ assert(dropView10_1.begin() == globalBuff + 4);
+ assert(dropView10_2.begin() == globalBuff + 6);
+ assert(sentinel_cmp_calls_1 == sentinel_cmp_calls_2);
+ }
+
+ {
static_assert(std::ranges::random_access_range<const SimpleView>);
static_assert(std::ranges::sized_range<const SimpleView>);
LIBCPP_STATIC_ASSERT(std::ranges::__simple_view<SimpleView>);
diff --git a/libcxx/test/std/ranges/range.adaptors/range.drop/types.h b/libcxx/test/std/ranges/range.adaptors/range.drop/types.h
index 32bbddc05ed9..1fc3f05bf5ea 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.drop/types.h
+++ b/libcxx/test/std/ranges/range.adaptors/range.drop/types.h
@@ -14,6 +14,38 @@
int globalBuff[8];
+template <class T>
+struct sentinel {
+ T* ptr_;
+ int* num_of_sentinel_cmp_calls;
+
+public:
+ friend constexpr bool operator==(sentinel const s, T* const ptr) noexcept {
+ ++(*s.num_of_sentinel_cmp_calls);
+ return {s.ptr_ == ptr};
+ }
+ friend constexpr bool operator==(T* const ptr, sentinel const s) noexcept {
+ ++(*s.num_of_sentinel_cmp_calls);
+ return {s.ptr_ == ptr};
+ }
+ friend constexpr bool operator!=(sentinel const s, T* const ptr) noexcept { return !(s == ptr); }
+ friend constexpr bool operator!=(T* const ptr, sentinel const s) noexcept { return !(s == ptr); }
+};
+
+template <bool IsSimple>
+struct MaybeSimpleNonCommonView : std::ranges::view_base {
+ int start_;
+ int* num_of_sentinel_cmp_calls;
+ constexpr std::size_t size() const { return 8; }
+ constexpr int* begin() { return globalBuff + start_; }
+ constexpr std::conditional_t<IsSimple, int*, const int*> begin() const { return globalBuff + start_; }
+ constexpr sentinel<int> end() { return sentinel<int>{globalBuff + size(), num_of_sentinel_cmp_calls}; }
+ constexpr auto end() const {
+ return std::conditional_t<IsSimple, sentinel<int>, sentinel<const int>>{
+ globalBuff + size(), num_of_sentinel_cmp_calls};
+ }
+};
+
struct MoveOnlyView : std::ranges::view_base {
int start_;
constexpr explicit MoveOnlyView(int start = 0) : start_(start) {}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp
index 91df304b79af..8eeaa3dae36d 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/ctor.range.pass.cpp
@@ -22,22 +22,21 @@
#include <utility>
#include "test_convertible.h"
+#include "test_macros.h"
#include "types.h"
struct ElementWithCounting {
int* times_copied = nullptr;
- int* times_moved = nullptr;
+ int* times_moved = nullptr;
constexpr ElementWithCounting(int& copies_ctr, int& moves_ctr) : times_copied(&copies_ctr), times_moved(&moves_ctr) {}
constexpr ElementWithCounting(const ElementWithCounting& rhs)
- : times_copied(rhs.times_copied)
- , times_moved(rhs.times_moved) {
+ : times_copied(rhs.times_copied), times_moved(rhs.times_moved) {
++(*times_copied);
}
constexpr ElementWithCounting(ElementWithCounting&& rhs)
- : times_copied(rhs.times_copied)
- , times_moved(rhs.times_moved) {
+ : times_copied(rhs.times_copied), times_moved(rhs.times_moved) {
++(*times_moved);
}
@@ -48,18 +47,15 @@ struct RangeWithCounting {
using value_type = ElementWithCounting;
int* times_copied = nullptr;
- int* times_moved = nullptr;
+ int* times_moved = nullptr;
constexpr RangeWithCounting(int& copies_ctr, int& moves_ctr) : times_copied(&copies_ctr), times_moved(&moves_ctr) {}
constexpr RangeWithCounting(const RangeWithCounting& rhs)
- : times_copied(rhs.times_copied)
- , times_moved(rhs.times_moved) {
+ : times_copied(rhs.times_copied), times_moved(rhs.times_moved) {
++(*times_copied);
}
- constexpr RangeWithCounting(RangeWithCounting&& rhs)
- : times_copied(rhs.times_copied)
- , times_moved(rhs.times_moved) {
+ constexpr RangeWithCounting(RangeWithCounting&& rhs) : times_copied(rhs.times_copied), times_moved(rhs.times_moved) {
++(*times_moved);
}
@@ -67,10 +63,10 @@ struct RangeWithCounting {
constexpr const ElementWithCounting* end() const { return nullptr; }
constexpr RangeWithCounting& operator=(const RangeWithCounting&) = default;
- constexpr RangeWithCounting& operator=(RangeWithCounting&&) = default;
+ constexpr RangeWithCounting& operator=(RangeWithCounting&&) = default;
constexpr bool operator==(const RangeWithCounting&) const { return true; }
};
-static_assert( std::ranges::forward_range<RangeWithCounting>);
+static_assert(std::ranges::forward_range<RangeWithCounting>);
static_assert(!std::ranges::view<RangeWithCounting>);
struct StrView : std::ranges::view_base {
@@ -86,9 +82,9 @@ struct StrView : std::ranges::view_base {
constexpr std::string_view::const_iterator end() const { return buffer_.end(); }
constexpr bool operator==(const StrView& rhs) const { return buffer_ == rhs.buffer_; }
};
-static_assert( std::ranges::random_access_range<StrView>);
-static_assert( std::ranges::view<StrView>);
-static_assert( std::is_copy_constructible_v<StrView>);
+static_assert(std::ranges::random_access_range<StrView>);
+static_assert(std::ranges::view<StrView>);
+static_assert(std::is_copy_constructible_v<StrView>);
// SFINAE tests.
@@ -131,7 +127,7 @@ constexpr bool test() {
// Make sure the arguments are moved, not copied.
{
- using Range = RangeWithCounting;
+ using Range = RangeWithCounting;
using Element = ElementWithCounting;
using Pattern = std::ranges::single_view<Element>;
@@ -144,10 +140,13 @@ constexpr bool test() {
Element element(element_copied, element_moved);
std::ranges::lazy_split_view<View, Pattern> v(range, element);
- assert(range_copied == 0); // `ref_view` does neither copy...
- assert(range_moved == 0); // ...nor move the element.
+ assert(range_copied == 0); // `ref_view` does neither copy...
+ assert(range_moved == 0); // ...nor move the element.
assert(element_copied == 1); // The element is copied into the argument...
+#ifndef TEST_COMPILER_GCC
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995
assert(element_moved == 1); // ...and moved into the member variable.
+#endif
}
// Arguments are rvalues.
@@ -160,7 +159,10 @@ constexpr bool test() {
assert(range_copied == 0);
assert(range_moved == 1); // `owning_view` moves the given argument.
assert(element_copied == 0);
+#ifndef TEST_COMPILER_GCC
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995
assert(element_moved == 1);
+#endif
}
}
diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp
index 32fed6f8caa7..cfe394e89183 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp
@@ -108,5 +108,22 @@ int main(int, char**) {
}
#endif
+ // GH issue #70506
+ // movable_box::operator= overwrites underlying view
+ {
+ auto f = [l = 0.0L, b = false](int i) {
+ (void)l;
+ (void)b;
+ return i;
+ };
+
+ auto v1 = std::vector{1, 2, 3, 4} | std::views::transform(f);
+ auto v2 = std::vector{1, 2, 3, 4} | std::views::transform(f);
+
+ v1 = std::move(v2);
+ int expected[] = {1, 2, 3, 4};
+ assert(std::equal(v1.begin(), v1.end(), expected, expected + 4));
+ }
+
return 0;
}
diff --git a/libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp b/libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp
new file mode 100644
index 000000000000..3b35271d649c
--- /dev/null
+++ b/libcxx/test/std/ranges/ranges_robust_against_no_unique_address.pass.cpp
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Test that views that use __movable_box do not overwrite overlapping subobjects.
+// https://github.com/llvm/llvm-project/issues/70506
+
+#include <cassert>
+#include <ranges>
+
+#include "test_macros.h"
+
+struct Pred {
+ alignas(128) bool a{};
+
+ Pred() noexcept = default;
+ Pred(const Pred&) noexcept = default;
+ Pred(Pred&&) noexcept = default;
+
+ Pred& operator=(const Pred&) = delete;
+ Pred& operator=(Pred&&) = delete;
+
+ constexpr bool operator()(const auto&...) const { return true; }
+};
+
+struct View : std::ranges::view_base {
+ constexpr int* begin() const { return nullptr; }
+ constexpr int* end() const { return nullptr; }
+};
+
+template <class View>
+struct S {
+ [[no_unique_address]] View view{};
+ char c = 42;
+};
+
+template <class View>
+constexpr void testOne() {
+ S<View> s1;
+ assert(s1.c == 42);
+ s1.view = View{};
+ assert(s1.c == 42);
+}
+
+constexpr bool test() {
+ testOne<std::ranges::transform_view<View, Pred>>();
+ testOne<std::ranges::filter_view<View, Pred>>();
+ testOne<std::ranges::drop_while_view<View, Pred>>();
+ testOne<std::ranges::take_while_view<View, Pred>>();
+ testOne<std::ranges::single_view<Pred>>();
+
+#if TEST_STD_VER >= 23
+ testOne<std::ranges::chunk_by_view<View, Pred>>();
+ testOne<std::ranges::repeat_view<Pred>>();
+#endif
+ return true;
+}
+
+int main(int, char**) {
+ static_assert(test());
+ test();
+
+ return 0;
+}
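For context, a rough illustration (an assumption, not taken from the patch) of why the test above matters: with [[no_unique_address]] an empty view member may share its storage with the next member, so a __movable_box assignment that rewrites the whole member object, rather than just the stored value, can clobber the sibling field.

#include <ranges>

struct EmptyView : std::ranges::view_base {
  constexpr int* begin() const { return nullptr; }
  constexpr int* end() const { return nullptr; }
};
struct Overlapped {
  [[no_unique_address]] EmptyView e; // typically occupies no distinct bytes (Itanium ABI)
  char c = 42;                       // may share its address with `e`
};
// A destroy-and-reconstruct style assignment of `e` writes a full EmptyView
// object at that address and can overwrite `c`; the views exercised above
// must avoid doing that.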
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.deprecated_in_cxx17.verify.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.deprecated_in_cxx17.verify.cpp
new file mode 100644
index 000000000000..eae0f6ec757e
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.deprecated_in_cxx17.verify.cpp
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE
+
+// <memory>
+
+// shared_ptr
+
+// bool unique() const; // deprecated in C++17, removed in C++20
+
+#include <memory>
+
+void f() {
+ const std::shared_ptr<int> p;
+ p.unique(); // expected-warning {{'unique' is deprecated}}
+}
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.pass.cpp
index 0b7d29dbcc06..c767701d55c9 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.pass.cpp
@@ -10,7 +10,9 @@
// shared_ptr
-// bool unique() const;
+// bool unique() const; // deprecated in C++17, removed in C++20
+
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE
#include <memory>
#include <cassert>
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.removed_in_cxx20.verify.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.removed_in_cxx20.verify.cpp
new file mode 100644
index 000000000000..c149f032e9a4
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.obs/unique.removed_in_cxx20.verify.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <memory>
+
+// shared_ptr
+
+// bool unique() const; // deprecated in C++17, removed in C++20
+
+#include <memory>
+
+void f() {
+ const std::shared_ptr<int> p;
+ p.unique(); // expected-error {{no member named 'unique' in 'std::shared_ptr<int>'}}
+}
diff --git a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp
index 35f020da45c4..fbbee26ab26b 100644
--- a/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp
+++ b/libcxx/test/tools/clang_tidy_checks/header_exportable_declarations.cpp
@@ -275,7 +275,7 @@ static bool is_module(header_exportable_declarations::FileType file_type) {
case header_exportable_declarations::FileType::Unknown:
llvm::errs() << "This should be unreachable.\n";
- break;
+ return false;
}
}
diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h
index 90f25519fad1..5a53a18bcc5f 100644
--- a/libcxxabi/src/demangle/ItaniumDemangle.h
+++ b/libcxxabi/src/demangle/ItaniumDemangle.h
@@ -2794,7 +2794,7 @@ template <typename Derived, typename Alloc> struct AbstractManglingParser {
Node *parseClassEnumType();
Node *parseQualifiedType();
- Node *parseEncoding();
+ Node *parseEncoding(bool ParseParams = true);
bool parseCallOffset();
Node *parseSpecialName();
@@ -2911,7 +2911,7 @@ template <typename Derived, typename Alloc> struct AbstractManglingParser {
Node *parseDestructorName();
/// Top-level entry point into the parser.
- Node *parse();
+ Node *parse(bool ParseParams = true);
};
const char* parse_discriminator(const char* first, const char* last);
@@ -5405,7 +5405,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parseSpecialName() {
// ::= <data name>
// ::= <special-name>
template <typename Derived, typename Alloc>
-Node *AbstractManglingParser<Derived, Alloc>::parseEncoding() {
+Node *AbstractManglingParser<Derived, Alloc>::parseEncoding(bool ParseParams) {
// The template parameters of an encoding are unrelated to those of the
// enclosing context.
SaveTemplateParams SaveTemplateParamsScope(this);
@@ -5431,6 +5431,16 @@ Node *AbstractManglingParser<Derived, Alloc>::parseEncoding() {
if (IsEndOfEncoding())
return Name;
+ // ParseParams may be false at the top level only, when called from parse().
+ // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be
+ // false when demangling 3fooILZ3BarEET_f but is always true when demangling
+ // 3Bar.
+ if (!ParseParams) {
+ while (consume())
+ ;
+ return Name;
+ }
+
Node *Attrs = nullptr;
if (consumeIf("Ua9enable_ifI")) {
size_t BeforeArgs = Names.size();
@@ -5895,9 +5905,9 @@ AbstractManglingParser<Derived, Alloc>::parseTemplateArgs(bool TagTemplates) {
// extension ::= ___Z <encoding> _block_invoke<decimal-digit>+
// extension ::= ___Z <encoding> _block_invoke_<decimal-digit>+
template <typename Derived, typename Alloc>
-Node *AbstractManglingParser<Derived, Alloc>::parse() {
+Node *AbstractManglingParser<Derived, Alloc>::parse(bool ParseParams) {
if (consumeIf("_Z") || consumeIf("__Z")) {
- Node *Encoding = getDerived().parseEncoding();
+ Node *Encoding = getDerived().parseEncoding(ParseParams);
if (Encoding == nullptr)
return nullptr;
if (look() == '.') {
@@ -5911,7 +5921,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parse() {
}
if (consumeIf("___Z") || consumeIf("____Z")) {
- Node *Encoding = getDerived().parseEncoding();
+ Node *Encoding = getDerived().parseEncoding(ParseParams);
if (Encoding == nullptr || !consumeIf("_block_invoke"))
return nullptr;
bool RequireNumber = consumeIf('_');
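A hypothetical caller sketch of the new flag (ManglingParser and DefaultAllocator come from the surrounding demangler sources, not from this hunk): ParseParams=false applies to the outermost encoding only, so the template argument "3Bar" in the hunk's _Z3fooILZ3BarEET_f example is still parsed fully.

ManglingParser<DefaultAllocator> Parser(Mangled, Mangled + std::strlen(Mangled));
Node *AST = Parser.parse(/*ParseParams=*/false); // keep the name, discard the parameter suffix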
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 4f11affb35ed..cd2985b035bc 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -1951,6 +1951,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
parseMerge(".00cfg=.rdata");
parseMerge(".bss=.data");
+ if (isArm64EC(config->machine))
+ parseMerge(".wowthk=.text");
+
if (config->mingw) {
parseMerge(".ctors=.rdata");
parseMerge(".dtors=.rdata");
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 2982165530c0..2e34a6c5cfa2 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -560,7 +560,7 @@ void Writer::createECCodeMap() {
codeMap.clear();
std::optional<chpe_range_type> lastType;
- Chunk *first = nullptr, *last = nullptr;
+ Chunk *first, *last;
auto closeRange = [&]() {
if (lastType) {
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 1c3e015efc16..996f9957a63c 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -463,6 +463,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
case R_LARCH_B16:
case R_LARCH_B21:
case R_LARCH_B26:
+ case R_LARCH_CALL36:
return R_PLT_PC;
case R_LARCH_GOT_PC_HI20:
case R_LARCH_GOT64_PC_LO20:
@@ -590,6 +591,25 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
write32le(loc, setD10k16(read32le(loc), val >> 2));
return;
+ case R_LARCH_CALL36: {
+ // This relocation is designed for adjacent pcaddu18i+jirl pairs that are
+ // patched at the same time. Because of the sign extension of these insns'
+ // immediate fields, the relocation range is [-128G - 0x20000, +128G -
+ // 0x20000), and the target must be 4-byte aligned.
+ if (((int64_t)val + 0x20000) != llvm::SignExtend64(val + 0x20000, 38))
+ reportRangeError(loc, rel, Twine(val), llvm::minIntN(38) - 0x20000,
+ llvm::maxIntN(38) - 0x20000);
+ checkAlignment(loc, val, 4, rel);
+ // Since jirl sign-extends its offset immediate, add (1 << 17) to the
+ // original val to get the correct hi20.
+ uint32_t hi20 = extractBits(val + (1 << 17), 37, 18);
+ // Despite the name, the lower part is actually 18 bits, 4-byte aligned.
+ uint32_t lo16 = extractBits(val, 17, 2);
+ write32le(loc, setJ20(read32le(loc), hi20));
+ write32le(loc + 4, setK16(read32le(loc + 4), lo16));
+ return;
+ }
+
// Relocs intended for `addi`, `ld` or `st`.
case R_LARCH_PCALA_LO12:
// We have to again inspect the insn word to handle the R_LARCH_PCALA_LO12
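As a sanity check on the hi20/lo16 split above, here is the arithmetic for the exe1 case of the new lld/test/ELF/loongarch-call36.s added later in this patch (a sketch that mirrors extractBits rather than calling it):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t val = 0x60020 - 0x20010;                    // PC-relative distance to `foo`
  uint32_t hi20 = ((val + (1 << 17)) >> 18) & 0xfffff; // pcaddu18i immediate
  uint32_t lo16 = (val >> 2) & 0xffff;                 // jirl immediate, in 4-byte units
  assert(hi20 == 1 && lo16 == 4);                      // i.e. pcaddu18i $t0, 1; jirl ..., 16
  return 0;
}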
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 81468a20dfb5..5dfb57fda432 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -898,10 +898,16 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
const TargetInfo &target = *elf::target;
const auto emachine = config->emachine;
const bool isDebug = isDebugSection(*this);
- const bool isDebugLocOrRanges =
- isDebug && (name == ".debug_loc" || name == ".debug_ranges");
const bool isDebugLine = isDebug && name == ".debug_line";
std::optional<uint64_t> tombstone;
+ if (isDebug) {
+ if (name == ".debug_loc" || name == ".debug_ranges")
+ tombstone = 1;
+ else if (name == ".debug_names")
+ tombstone = UINT64_MAX; // tombstone value
+ else
+ tombstone = 0;
+ }
for (const auto &patAndValue : llvm::reverse(config->deadRelocInNonAlloc))
if (patAndValue.first.match(this->name)) {
tombstone = patAndValue.second;
@@ -946,8 +952,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
return;
}
- if (tombstone ||
- (isDebug && (type == target.symbolicRel || expr == R_DTPREL))) {
+ if (tombstone && (expr == R_ABS || expr == R_DTPREL)) {
// Resolve relocations in .debug_* referencing (discarded symbols or ICF
// folded section symbols) to a tombstone value. Resolving to addend is
// unsatisfactory because the result address range may collide with a
@@ -978,8 +983,13 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
// value. Enable -1 in a future release.
if (!sym.getOutputSection() || (ds && ds->folded && !isDebugLine)) {
// If -z dead-reloc-in-nonalloc= is specified, respect it.
- const uint64_t value = tombstone ? SignExtend64<bits>(*tombstone)
- : (isDebugLocOrRanges ? 1 : 0);
+ uint64_t value = SignExtend64<bits>(*tombstone);
+ // For a 32-bit local TU reference in .debug_names, X86_64::relocate
+ // requires that the unsigned value for R_X86_64_32 be truncated to
+ // 32 bits. Other 64-bit targets don't discern signed/unsigned 32-bit
+ // absolute relocations and do not need this change.
+ if (emachine == EM_X86_64 && type == R_X86_64_32)
+ value = static_cast<uint32_t>(value);
target.relocateNoSym(bufLoc, type, value);
continue;
}
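A condensed restatement (a sketch, not code from the patch) of the tombstone selection introduced above, gathering the policy in one place:

// Given a debug section name, pick the value used to overwrite relocations
// that reference discarded or ICF-folded sections.
static uint64_t pickTombstone(llvm::StringRef name) {
  if (name == ".debug_loc" || name == ".debug_ranges")
    return 1;          // 0 would look like a list terminator in these sections
  if (name == ".debug_names")
    return UINT64_MAX; // -1 marks the entry as dead; truncated for R_X86_64_32
  return 0;            // all other .debug_* sections
}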
diff --git a/lld/MachO/InputFiles.h b/lld/MachO/InputFiles.h
index 2e37e7ba5a00..5e550c167c23 100644
--- a/lld/MachO/InputFiles.h
+++ b/lld/MachO/InputFiles.h
@@ -140,7 +140,7 @@ protected:
InputFile(Kind, const llvm::MachO::InterfaceFile &);
- // If true, this input's arch is compatiable with target.
+ // If true, this input's arch is compatible with target.
bool compatArch = true;
private:
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index f92e6cda31e5..01e73b789f9a 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -132,7 +132,7 @@ def check_category_conflicts : Flag<["--"], "check-category-conflicts">,
def lto_debug_pass_manager: Flag<["--"], "lto-debug-pass-manager">,
HelpText<"Debug new pass manager">, Group<grp_lld>;
def cs_profile_generate: Flag<["--"], "cs-profile-generate">,
- HelpText<"Perform context senstive PGO instrumentation">, Group<grp_lld>;
+ HelpText<"Perform context sensitive PGO instrumentation">, Group<grp_lld>;
def cs_profile_path: Joined<["--"], "cs-profile-path=">,
HelpText<"Context sensitive profile file path">, Group<grp_lld>;
defm pgo_warn_mismatch: BB<"pgo-warn-mismatch",
diff --git a/lld/test/COFF/merge-wowthk.s b/lld/test/COFF/merge-wowthk.s
new file mode 100644
index 000000000000..a1358fc30e4c
--- /dev/null
+++ b/lld/test/COFF/merge-wowthk.s
@@ -0,0 +1,44 @@
+// REQUIRES: aarch64
+
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %s -o %t-arm64ec.obj
+// RUN: llvm-mc -filetype=obj -triple=aarch64-windows %s -o %t-arm64.obj
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o %t-loadcfg.obj
+
+// Check that .wowthk section is merged into .text on ARM64EC target.
+
+// RUN: lld-link -out:%t.dll -machine:arm64ec %t-arm64ec.obj %t-loadcfg.obj -dll -noentry
+// RUN: llvm-objdump -d %t.dll | FileCheck -check-prefix=DISASM %s
+// DISASM: 0000000180001000 <.text>:
+// DISASM-NEXT: 180001000: 52800040 mov w0, #0x2 // =2
+// DISASM-NEXT: 180001004: d65f03c0 ret
+// DISASM-NEXT: 180001008: 52800060 mov w0, #0x3 // =3
+// DISASM-NEXT: 18000100c: d65f03c0 ret
+
+// Check that .wowthk section is not merged on aarch64 target.
+
+// RUN: lld-link -out:%t.dll -machine:arm64 %t-arm64.obj -dll -noentry
+// RUN: llvm-objdump -d %t.dll | FileCheck -check-prefix=DISASM2 %s
+// DISASM2: 0000000180001000 <.text>:
+// DISASM2-NEXT: 180001000: 52800040 mov w0, #0x2 // =2
+// DISASM2-NEXT: 180001004: d65f03c0 ret
+// DISASM2-EMPTY:
+// DISASM2-NEXT: Disassembly of section .wowthk:
+// DISASM2-EMPTY:
+// DISASM2-NEXT: 0000000180002000 <.wowthk>:
+// DISASM2-NEXT: 180002000: 52800060 mov w0, #0x3 // =3
+// DISASM2-NEXT: 180002004: d65f03c0 ret
+
+
+ .text
+ .globl arm64ec_func_sym
+ .p2align 2, 0x0
+arm64ec_func_sym:
+ mov w0, #2
+ ret
+
+ .section .wowthk$aa, "x"
+ .globl wowthk_sym
+ .p2align 3, 0x0
+wowthk_sym:
+ mov w0, #3
+ ret
diff --git a/lld/test/COFF/savetemps.ll b/lld/test/COFF/savetemps.ll
index 46a4958d2f78..64d0566108b8 100644
--- a/lld/test/COFF/savetemps.ll
+++ b/lld/test/COFF/savetemps.ll
@@ -18,7 +18,7 @@
; RUN: llvm-objdump -s %T/savetemps/savetemps.exe.lto.obj | \
; RUN: FileCheck --check-prefix=CHECK-OBJDUMP %s
-; CHECK: define i32 @main()
+; CHECK: define {{(noundef )?}}i32 @main()
; CHECK-OBJDUMP: file format coff
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/lld/test/ELF/debug-dead-reloc-32.s b/lld/test/ELF/debug-dead-reloc-32.s
index 99335b44f51c..1aa43148689e 100644
--- a/lld/test/ELF/debug-dead-reloc-32.s
+++ b/lld/test/ELF/debug-dead-reloc-32.s
@@ -13,6 +13,8 @@
# CHECK-NEXT: 0000 01000000
# CHECK-NEXT: Contents of section .debug_addr:
# CHECK-NEXT: 0000 00000000
+# CHECK-NEXT: Contents of section .debug_names:
+# CHECK-NEXT: 0000 ffffffff
## -z dead-reloc-in-nonalloc= can override the tombstone value.
# RUN: ld.lld -z dead-reloc-in-nonalloc=.debug_loc=42 -z dead-reloc-in-nonalloc=.debug_addr=0xfffffffffffffffe %t.o -o %t1
@@ -38,3 +40,12 @@
## Resolved to UINT32_C(0), with the addend ignored.
.section .debug_addr
.long .text.1+8
+
+.section .debug_info,"eG",@progbits,5657452045627120676,comdat
+.Ltu_begin0:
+
+.section .debug_names
+## .debug_names may reference a local type unit defined in a COMDAT .debug_info
+## section (-g -gpubnames -fdebug-types-section). If the referenced section is
+## non-prevailing, resolve to UINT32_MAX.
+.long .Ltu_begin0
diff --git a/lld/test/ELF/debug-dead-reloc.s b/lld/test/ELF/debug-dead-reloc.s
index cfa41e00eab0..1a8823737ed5 100644
--- a/lld/test/ELF/debug-dead-reloc.s
+++ b/lld/test/ELF/debug-dead-reloc.s
@@ -21,9 +21,12 @@
# CHECK: Contents of section .debug_addr:
# CHECK-NEXT: 0000 {{.*}}00 00000000 {{.*}}00 00000000
# CHECK-NEXT: 0010 00000000 00000000 {{.*}}00 00000000
+# CHECK: Contents of section .debug_names:
+# CHECK-NEXT: 0000 00000000 00000000 00000000 ffffffff .
+# CHECK-NEXT: 0010 ffffffff ffffffff .
# CHECK: Contents of section .debug_foo:
-# CHECK-NEXT: 0000 00000000 00000000 08000000 00000000
-# CHECK-NEXT: 0010 00000000 00000000 08000000 00000000
+# CHECK-NEXT: 0000 00000000 00000000 00000000 00000000
+# CHECK-NEXT: 0010 00000000 00000000 00000000 00000000
# REL: Relocations [
# REL-NEXT: .rela.text {
@@ -43,6 +46,12 @@
# REL-NEXT: 0x10 R_X86_64_NONE - 0x18
# REL-NEXT: 0x18 R_X86_64_64 group 0x20
# REL-NEXT: }
+# REL-NEXT: .rela.debug_names {
+# REL-NEXT: 0x0 R_X86_64_32 .debug_info 0x0
+# REL-NEXT: 0x4 R_X86_64_64 .debug_info 0x0
+# REL-NEXT: 0xC R_X86_64_NONE - 0x0
+# REL-NEXT: 0x10 R_X86_64_NONE - 0x0
+# REL-NEXT: }
# REL-NEXT: .rela.debug_foo {
# REL-NEXT: 0x0 R_X86_64_NONE - 0x8
# REL-NEXT: 0x8 R_X86_64_NONE - 0x8
@@ -82,6 +91,17 @@ group:
## resolved to the prevailing copy.
.quad group+32
+.section .debug_info,"G",@progbits,5657452045627120676,comdat
+.Ltu_begin0:
+
+.section .debug_names
+## .debug_names may reference a local type unit defined in a COMDAT .debug_info
+## section (-g -gpubnames -fdebug-types-section). If the referenced section is
+## non-prevailing, resolve to UINT32_MAX.
+.long .Ltu_begin0
+## ... or UINT64_MAX for DWARF64.
+.quad .Ltu_begin0
+
.section .debug_foo
.quad .text.1+8
diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s
new file mode 100644
index 000000000000..b593fdf1f604
--- /dev/null
+++ b/lld/test/ELF/loongarch-call36.s
@@ -0,0 +1,69 @@
+# REQUIRES: loongarch
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc --filetype=obj --triple=loongarch64-unknown-elf %t/a.s -o %t/a.o
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x60020 -o %t/exe1
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe1 | FileCheck --match-full-lines %s --check-prefix=EXE1
+## hi20 = target - pc + (1 << 17) >> 18 = 0x60020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x60020 - 0x20010 & 0x3ffff = 16
+# EXE1: 20010: pcaddu18i $t0, 1
+# EXE1-NEXT: 20014: jirl $zero, $t0, 16
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x40020 -o %t/exe2
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe2 | FileCheck --match-full-lines %s --check-prefix=EXE2
+## hi20 = target - pc + (1 << 17) >> 18 = 0x40020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x40020 - 0x20010 & 0x3ffff = -131056
+# EXE2: 20010: pcaddu18i $t0, 1
+# EXE2-NEXT: 20014: jirl $zero, $t0, -131056
+
+# RUN: ld.lld %t/a.o -shared -T %t/a.t -o %t/a.so
+# RUN: llvm-readelf -x .got.plt %t/a.so | FileCheck --check-prefix=GOTPLT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a.so | FileCheck --check-prefix=SO %s
+## PLT should be present in this case.
+# SO: Disassembly of section .plt:
+# SO: <.plt>:
+## foo@plt:
+# SO: 1234520: pcaddu12i $t3, 64{{$}}
+# SO-NEXT: ld.d $t3, $t3, 544{{$}}
+# SO-NEXT: jirl $t1, $t3, 0
+# SO-NEXT: nop
+
+# SO: Disassembly of section .text:
+# SO: <_start>:
+## hi20 = foo@plt - pc + (1 << 17) >> 18 = 0x1234520 - 0x1274670 + 0x20000 >> 18 = -1
+## lo18 = foo@plt - pc & (1 << 18) - 1 = 0x1234520 - 0x1274670 & 0x3ffff = -336
+# SO-NEXT: pcaddu18i $t0, -1{{$}}
+# SO-NEXT: jirl $zero, $t0, -336{{$}}
+
+# GOTPLT: section '.got.plt':
+# GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000
+# GOTPLT-NEXT: 0x01274740 00452301 00000000
+
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \
+# RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s
+# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo'
+
+## Impossible case in reality because all LoongArch instructions are a fixed 4 bytes long.
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \
+# RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s
+# ERROR-ALIGN: error: [[FILE]]:(.text+0x0): improper alignment for relocation R_LARCH_CALL36: 0x20001 is not aligned to 4 bytes
+
+#--- a.t
+SECTIONS {
+ .plt 0x1234500: { *(.plt) }
+ .text 0x1274670: { *(.text) }
+}
+
+#--- a.s
+.text
+.global _start
+_start:
+ .reloc ., R_LARCH_CALL36, foo
+ pcaddu18i $t0, 0
+ jirl $zero, $t0, 0
+
+.section .sec.foo,"awx"
+.global foo
+foo:
+ ret
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll
index 74e437747df3..eb2e9970f723 100644
--- a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll
@@ -183,7 +183,7 @@ target triple = "x86_64-unknown-linux-gnu"
;; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ]
-; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start
+; CHECK-COMMON-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll
index 15040b8707ae..9dacbc32175a 100644
--- a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll
@@ -53,7 +53,7 @@ target triple = "x86_64-unknown-linux-gnu"
;; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata"
-; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start
+; CHECK-COMMON-IR-LABEL: define dso_local noundef i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
;; Call function built with RegularLTO
%RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a)
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll
index 3f3ea2cc5a37..2c6f63f75529 100644
--- a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll
@@ -67,7 +67,7 @@ target triple = "x86_64-unknown-linux-gnu"
;; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ]
-; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start
+; CHECK-COMMON-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll b/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll
index 7686fa978d4d..2a52c5ad8ae4 100644
--- a/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll
+++ b/lld/test/ELF/lto/devirt_vcall_vis_export_dynamic.ll
@@ -130,7 +130,7 @@ target triple = "x86_64-grtev4-linux-gnu"
;; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D]
-; CHECK-IR-LABEL: define dso_local i32 @_start
+; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/lld/test/ELF/lto/devirt_vcall_vis_public.ll b/lld/test/ELF/lto/devirt_vcall_vis_public.ll
index 1c0d55f7d73a..a827fea465fd 100644
--- a/lld/test/ELF/lto/devirt_vcall_vis_public.ll
+++ b/lld/test/ELF/lto/devirt_vcall_vis_public.ll
@@ -65,7 +65,7 @@ target triple = "x86_64-grtev4-linux-gnu"
; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D]
-; CHECK-IR-LABEL: define dso_local i32 @_start
+; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp
index 96ac1e1610dd..5709a5ced584 100644
--- a/lld/wasm/InputFiles.cpp
+++ b/lld/wasm/InputFiles.cpp
@@ -680,16 +680,7 @@ Symbol *ObjFile::createUndefined(const WasmSymbol &sym, bool isCalledDirectly) {
llvm_unreachable("unknown symbol kind");
}
-
-StringRef strip(StringRef s) {
- while (s.starts_with(" ")) {
- s = s.drop_front();
- }
- while (s.ends_with(" ")) {
- s = s.drop_back();
- }
- return s;
-}
+StringRef strip(StringRef s) { return s.trim(' '); }
void StubFile::parse() {
bool first = true;
diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt
index 4a53d7ef3d0d..7844d93d78d2 100644
--- a/lldb/CMakeLists.txt
+++ b/lldb/CMakeLists.txt
@@ -44,6 +44,10 @@ endif()
if (WIN32)
add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE)
+ if (NOT MSVC)
+ # _BSD_SOURCE is required for MinGW's getopt.h to define optreset
+ add_definitions(-D_BSD_SOURCE)
+ endif()
endif()
if (LLDB_ENABLE_PYTHON)
diff --git a/lldb/include/lldb/Host/HostGetOpt.h b/lldb/include/lldb/Host/HostGetOpt.h
index 746e03e1bd1e..52cfdf4dbb89 100644
--- a/lldb/include/lldb/Host/HostGetOpt.h
+++ b/lldb/include/lldb/Host/HostGetOpt.h
@@ -11,10 +11,6 @@
#if !defined(_MSC_VER) && !defined(__NetBSD__)
-#ifdef _WIN32
-#define _BSD_SOURCE // Required so that getopt.h defines optreset
-#endif
-
#include <getopt.h>
#include <unistd.h>
diff --git a/lldb/source/Host/linux/HostInfoLinux.cpp b/lldb/source/Host/linux/HostInfoLinux.cpp
index c66f787db0cf..723f0c2fb3fd 100644
--- a/lldb/source/Host/linux/HostInfoLinux.cpp
+++ b/lldb/source/Host/linux/HostInfoLinux.cpp
@@ -123,7 +123,7 @@ llvm::StringRef HostInfoLinux::GetDistributionId() {
if (strstr(distribution_id, distributor_id_key)) {
// strip newlines
std::string id_string(distribution_id + strlen(distributor_id_key));
- llvm::erase_value(id_string, '\n');
+ llvm::erase(id_string, '\n');
// lower case it and convert whitespace to underscores
std::transform(
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index e1275ce711fc..00651df48b62 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -1160,7 +1160,11 @@ Status CommandInterpreter::AddUserCommand(llvm::StringRef name,
if (UserCommandExists(name)) {
if (!can_replace) {
- result.SetErrorString("user command exists and force replace not set");
+ result.SetErrorStringWithFormatv(
+ "user command \"{0}\" already exists and force replace was not set "
+ "by --overwrite or 'settings set interpreter.require-overwrite "
+ "false'",
+ name);
return result;
}
if (cmd_sp->IsMultiwordObject()) {
diff --git a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp
index 37dc8f6d6d14..ad67e764fe10 100644
--- a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp
+++ b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp
@@ -7,18 +7,14 @@
//===----------------------------------------------------------------------===//
#include "lldb/Host/HostInfo.h"
-#include "lldb/Host/HostNativeThreadBase.h"
-#include "lldb/Host/windows/HostThreadWindows.h"
-#include "lldb/Host/windows/windows.h"
-#include "lldb/Target/RegisterContext.h"
#include "lldb/Target/Unwind.h"
#include "lldb/Utility/LLDBLog.h"
#include "lldb/Utility/Log.h"
-#include "lldb/Utility/State.h"
#include "ProcessWindows.h"
-#include "ProcessWindowsLog.h"
#include "TargetThreadWindows.h"
+#include "lldb/Host/windows/HostThreadWindows.h"
+#include <llvm/Support/ConvertUTF.h>
#if defined(__x86_64__) || defined(_M_AMD64)
#include "x64/RegisterContextWindows_x64.h"
@@ -33,6 +29,9 @@
using namespace lldb;
using namespace lldb_private;
+using GetThreadDescriptionFunctionPtr = HRESULT
+WINAPI (*)(HANDLE hThread, PWSTR *ppszThreadDescription);
+
TargetThreadWindows::TargetThreadWindows(ProcessWindows &process,
const HostThread &thread)
: Thread(process, thread.GetNativeThread().GetThreadId()),
@@ -175,3 +174,29 @@ Status TargetThreadWindows::DoResume() {
return Status();
}
+
+const char *TargetThreadWindows::GetName() {
+ Log *log = GetLog(LLDBLog::Thread);
+ static GetThreadDescriptionFunctionPtr GetThreadDescription = []() {
+ HMODULE hModule = ::LoadLibraryW(L"Kernel32.dll");
+ return hModule ? reinterpret_cast<GetThreadDescriptionFunctionPtr>(
+ ::GetProcAddress(hModule, "GetThreadDescription"))
+ : nullptr;
+ }();
+ LLDB_LOGF(log, "GetProcAddress: %p",
+ reinterpret_cast<void *>(GetThreadDescription));
+ if (!GetThreadDescription)
+ return m_name.c_str();
+ PWSTR pszThreadName;
+ if (SUCCEEDED(GetThreadDescription(
+ m_host_thread.GetNativeThread().GetSystemHandle(), &pszThreadName))) {
+ LLDB_LOGF(log, "GetThreadDescription: %ls", pszThreadName);
+ llvm::convertUTF16ToUTF8String(
+ llvm::ArrayRef(reinterpret_cast<char *>(pszThreadName),
+ wcslen(pszThreadName) * sizeof(wchar_t)),
+ m_name);
+ ::LocalFree(pszThreadName);
+ }
+
+ return m_name.c_str();
+}
diff --git a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h
index 2845847738f6..07e1db464ad5 100644
--- a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h
+++ b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h
@@ -34,6 +34,7 @@ public:
lldb::RegisterContextSP
CreateRegisterContextForFrame(StackFrame *frame) override;
bool CalculateStopInfo() override;
+ const char *GetName() override;
Status DoResume();
@@ -42,6 +43,7 @@ public:
private:
lldb::RegisterContextSP m_thread_reg_ctx_sp;
HostThread m_host_thread;
+ std::string m_name;
};
} // namespace lldb_private
diff --git a/lldb/source/Target/JITLoaderList.cpp b/lldb/source/Target/JITLoaderList.cpp
index 9158d0a5e546..9fa070edd4b8 100644
--- a/lldb/source/Target/JITLoaderList.cpp
+++ b/lldb/source/Target/JITLoaderList.cpp
@@ -24,7 +24,7 @@ void JITLoaderList::Append(const JITLoaderSP &jit_loader_sp) {
void JITLoaderList::Remove(const JITLoaderSP &jit_loader_sp) {
std::lock_guard<std::recursive_mutex> guard(m_jit_loaders_mutex);
- llvm::erase_value(m_jit_loaders_vec, jit_loader_sp);
+ llvm::erase(m_jit_loaders_vec, jit_loader_sp);
}
size_t JITLoaderList::GetSize() const { return m_jit_loaders_vec.size(); }
diff --git a/lldb/test/API/commands/command/script/TestCommandScript.py b/lldb/test/API/commands/command/script/TestCommandScript.py
index cac11834fa73..850552032902 100644
--- a/lldb/test/API/commands/command/script/TestCommandScript.py
+++ b/lldb/test/API/commands/command/script/TestCommandScript.py
@@ -161,6 +161,19 @@ class CmdPythonTestCase(TestBase):
)
self.expect("my_command", substrs=["a.out"])
+ # Test that without --overwrite we are not allowed to redefine the command.
+ self.expect(
+ "command script add my_command --class welcome.TargetnameCommand",
+ substrs=[
+ (
+ 'user command "my_command" already exists and force replace was'
+ " not set by --overwrite or 'settings set"
+ " interpreter.require-overwrite false'"
+ ),
+ ],
+ error=True,
+ )
+
self.runCmd("command script clear")
self.expect(
diff --git a/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py b/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py
index 6bb22c46efb4..5b6e9e8a588a 100644
--- a/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py
+++ b/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py
@@ -26,6 +26,13 @@ class GlobalModuleCacheTestCase(TestBase):
# a previous build, so sleep a bit here to ensure that the touch is later.
time.sleep(2)
try:
+ # Make sure dst is writeable before trying to write to it.
+ subprocess.run(
+ ["chmod", "777", dst],
+ stdin=None,
+ capture_output=False,
+ encoding="utf-8",
+ )
shutil.copy(src, dst)
except:
self.fail(f"Could not copy {src} to {dst}")
diff --git a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
index cc544919b6a8..90b130d3af4d 100644
--- a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
+++ b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
@@ -3,10 +3,10 @@ Test lldb-dap variables/stackTrace request for optimized code
"""
import dap_server
+import lldbdap_testcase
+from lldbsuite.test import lldbutil
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
-from lldbsuite.test import lldbutil
-import lldbdap_testcase
class TestDAP_optimized(lldbdap_testcase.DAPTestCaseBase):
@@ -47,3 +47,8 @@ class TestDAP_optimized(lldbdap_testcase.DAPTestCaseBase):
optimized_variable = self.dap_server.get_local_variable("argc")
self.assertTrue(optimized_variable["value"].startswith("<error:"))
+ error_msg = optimized_variable["$__lldb_extensions"]["error"]
+ self.assertTrue(
+ ("Could not evaluate DW_OP_entry_value" in error_msg)
+ or ("variable not available" in error_msg)
+ )
diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
index 9b0755eea7d3..58d6b941ac81 100644
--- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
+++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
@@ -153,10 +153,18 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
buffer_children = make_buffer_verify_dict(0, 32)
verify_locals = {
"argc": {
- "equals": {"type": "int", "value": "1"},
- "declaration": {
- "equals": {"line": 12, "column": 14},
- "contains": {"path": ["lldb-dap", "variables", "main.cpp"]},
+ "equals": {
+ "type": "int",
+ "value": "1",
+ },
+ "$__lldb_extensions": {
+ "equals": {
+ "value": "1",
+ },
+ "declaration": {
+ "equals": {"line": 12, "column": 14},
+ "contains": {"path": ["lldb-dap", "variables", "main.cpp"]},
+ },
},
},
"argv": {
@@ -165,7 +173,9 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
"hasVariablesReference": True,
},
"pt": {
- "equals": {"type": "PointType"},
+ "equals": {
+ "type": "PointType",
+ },
"hasVariablesReference": True,
"children": {
"x": {"equals": {"type": "int", "value": "11"}},
@@ -175,6 +185,10 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
},
"x": {"equals": {"type": "int"}},
}
+ if enableAutoVariableSummaries:
+ verify_locals["pt"]["$__lldb_extensions"] = {
+ "equals": {"autoSummary": "{x:11, y:22}"}
+ }
verify_globals = {
"s_local": {"equals": {"type": "float", "value": "2.25"}},
"::g_global": {"equals": {"type": "int", "value": "123"}},
diff --git a/lldb/tools/lldb-dap/BreakpointBase.cpp b/lldb/tools/lldb-dap/BreakpointBase.cpp
index bc9bde97678a..fb4b27fbe315 100644
--- a/lldb/tools/lldb-dap/BreakpointBase.cpp
+++ b/lldb/tools/lldb-dap/BreakpointBase.cpp
@@ -296,7 +296,7 @@ bool BreakpointBase::BreakpointHitCallback(
frame.GetValueForVariablePath(expr, lldb::eDynamicDontRunTarget);
if (value.GetError().Fail())
value = frame.EvaluateExpression(expr);
- output += ValueToString(value);
+ output += VariableDescription(value).display_value;
} else {
output += messagePart.text;
}
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index a0a175f960bc..df17ac9d8491 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -211,46 +211,6 @@ static std::optional<std::string> TryCreateAutoSummary(lldb::SBValue value) {
return TryCreateAutoSummaryForContainer(value);
}
-std::string ValueToString(lldb::SBValue v) {
- std::string result;
- llvm::raw_string_ostream strm(result);
-
- lldb::SBError error = v.GetError();
- if (!error.Success()) {
- strm << "<error: " << error.GetCString() << ">";
- } else {
- llvm::StringRef value = v.GetValue();
- llvm::StringRef nonAutoSummary = v.GetSummary();
- std::optional<std::string> summary = !nonAutoSummary.empty()
- ? nonAutoSummary.str()
- : TryCreateAutoSummary(v);
- if (!value.empty()) {
- strm << value;
- if (summary)
- strm << ' ' << *summary;
- } else if (summary) {
- strm << *summary;
-
- // As last resort, we print its type and address if available.
- } else {
- if (llvm::StringRef type_name = v.GetType().GetDisplayTypeName();
- !type_name.empty()) {
- strm << type_name;
- lldb::addr_t address = v.GetLoadAddress();
- if (address != LLDB_INVALID_ADDRESS)
- strm << " @ " << llvm::format_hex(address, 0);
- }
- }
- }
- return result;
-}
-
-void SetValueForKey(lldb::SBValue &v, llvm::json::Object &object,
- llvm::StringRef key) {
- std::string result = ValueToString(v);
- EmplaceSafeString(object, key, result);
-}
-
void FillResponse(const llvm::json::Object &request,
llvm::json::Object &response) {
// Fill in all of the needed response fields to a "request" and set "success"
@@ -1045,6 +1005,92 @@ std::string CreateUniqueVariableNameForDisplay(lldb::SBValue v,
return name_builder.GetData();
}
+VariableDescription::VariableDescription(lldb::SBValue v, bool format_hex,
+ bool is_name_duplicated,
+ std::optional<std::string> custom_name)
+ : v(v) {
+ name = custom_name
+ ? *custom_name
+ : CreateUniqueVariableNameForDisplay(v, is_name_duplicated);
+
+ type_obj = v.GetType();
+ std::string raw_display_type_name =
+ llvm::StringRef(type_obj.GetDisplayTypeName()).str();
+ display_type_name =
+ !raw_display_type_name.empty() ? raw_display_type_name : NO_TYPENAME;
+
+ if (format_hex)
+ v.SetFormat(lldb::eFormatHex);
+
+ llvm::raw_string_ostream os_display_value(display_value);
+
+ if (lldb::SBError sb_error = v.GetError(); sb_error.Fail()) {
+ error = sb_error.GetCString();
+ os_display_value << "<error: " << error << ">";
+ } else {
+ value = llvm::StringRef(v.GetValue()).str();
+ summary = llvm::StringRef(v.GetSummary()).str();
+ if (summary.empty())
+ auto_summary = TryCreateAutoSummary(v);
+
+ std::optional<std::string> effective_summary =
+ !summary.empty() ? summary : auto_summary;
+
+ if (!value.empty()) {
+ os_display_value << value;
+ if (effective_summary)
+ os_display_value << " " << *effective_summary;
+ } else if (effective_summary) {
+ os_display_value << *effective_summary;
+
+ // As last resort, we print its type and address if available.
+ } else {
+ if (!raw_display_type_name.empty()) {
+ os_display_value << raw_display_type_name;
+ lldb::addr_t address = v.GetLoadAddress();
+ if (address != LLDB_INVALID_ADDRESS)
+ os_display_value << " @ " << llvm::format_hex(address, 0);
+ }
+ }
+ }
+
+ lldb::SBStream evaluateStream;
+ v.GetExpressionPath(evaluateStream);
+ evaluate_name = llvm::StringRef(evaluateStream.GetData()).str();
+}
+
+llvm::json::Object VariableDescription::GetVariableExtensionsJSON() {
+ llvm::json::Object extensions;
+ if (error)
+ EmplaceSafeString(extensions, "error", *error);
+ if (!value.empty())
+ EmplaceSafeString(extensions, "value", value);
+ if (!summary.empty())
+ EmplaceSafeString(extensions, "summary", summary);
+ if (auto_summary)
+ EmplaceSafeString(extensions, "autoSummary", *auto_summary);
+
+ if (lldb::SBDeclaration decl = v.GetDeclaration(); decl.IsValid()) {
+ llvm::json::Object decl_obj;
+ if (lldb::SBFileSpec file = decl.GetFileSpec(); file.IsValid()) {
+ char path[PATH_MAX] = "";
+ if (file.GetPath(path, sizeof(path)) &&
+ lldb::SBFileSpec::ResolvePath(path, path, PATH_MAX)) {
+ decl_obj.try_emplace("path", std::string(path));
+ }
+ }
+
+ if (int line = decl.GetLine())
+ decl_obj.try_emplace("line", line);
+ if (int column = decl.GetColumn())
+ decl_obj.try_emplace("column", column);
+
+ if (!decl_obj.empty())
+ extensions.try_emplace("declaration", std::move(decl_obj));
+ }
+ return extensions;
+}
+
// "Variable": {
// "type": "object",
// "description": "A Variable is a name/value pair. Optionally a variable
@@ -1104,27 +1150,56 @@ std::string CreateUniqueVariableNameForDisplay(lldb::SBValue v,
// can use this optional information to present the
// children in a paged UI and fetch them in chunks."
// }
-// "declaration": {
-// "type": "object | undefined",
-// "description": "Extension to the protocol that indicates the source
-// location where the variable was declared. This value
-// might not be present if no declaration is available.",
+//
+//
+// "$__lldb_extensions": {
+// "description": "Unofficial extensions to the protocol",
// "properties": {
-// "path": {
-// "type": "string | undefined",
-// "description": "The source file path where the variable was
-// declared."
-// },
-// "line": {
-// "type": "number | undefined",
-// "description": "The 1-indexed source line where the variable was
-// declared."
-// },
-// "column": {
-// "type": "number | undefined",
-// "description": "The 1-indexed source column where the variable was
-// declared."
+// "declaration": {
+// "type": "object",
+// "description": "The source location where the variable was declared.
+// This value won't be present if no declaration is
+// available.",
+// "properties": {
+// "path": {
+// "type": "string",
+// "description": "The source file path where the variable was
+// declared."
+// },
+// "line": {
+// "type": "number",
+// "description": "The 1-indexed source line where the variable was
+// declared."
+// },
+// "column": {
+// "type": "number",
+// "description": "The 1-indexed source column where the variable
+// was declared."
+// }
// }
+// },
+// "value":
+// "type": "string",
+// "description": "The internal value of the variable as returned by
+// This is effectively SBValue.GetValue(). The other
+// `value` entry in the top-level variable response is,
+// on the other hand, just a display string for the
+// variable."
+// },
+// "summary":
+// "type": "string",
+// "description": "The summary string of the variable. This is
+// effectively SBValue.GetSummary()."
+// },
+// "autoSummary":
+// "type": "string",
+// "description": "The auto generated summary if using
+// `enableAutoVariableSummaries`."
+// },
+// "error":
+// "type": "string",
+// "description": "An error message generated if LLDB couldn't inspect
+// the variable."
// }
// }
// },
@@ -1134,81 +1209,57 @@ llvm::json::Value CreateVariable(lldb::SBValue v, int64_t variablesReference,
int64_t varID, bool format_hex,
bool is_name_duplicated,
std::optional<std::string> custom_name) {
+ VariableDescription desc(v, format_hex, is_name_duplicated, custom_name);
llvm::json::Object object;
- EmplaceSafeString(
- object, "name",
- custom_name ? *custom_name
- : CreateUniqueVariableNameForDisplay(v, is_name_duplicated));
+ EmplaceSafeString(object, "name", desc.name);
+ EmplaceSafeString(object, "value", desc.display_value);
+
+ if (!desc.evaluate_name.empty())
+ EmplaceSafeString(object, "evaluateName", desc.evaluate_name);
- if (format_hex)
- v.SetFormat(lldb::eFormatHex);
- SetValueForKey(v, object, "value");
- auto type_obj = v.GetType();
- auto type_cstr = type_obj.GetDisplayTypeName();
// If we have a type with many children, we would like to be able to
// give a hint to the IDE that the type has indexed children so that the
- // request can be broken up in grabbing only a few children at a time. We want
- // to be careful and only call "v.GetNumChildren()" if we have an array type
- // or if we have a synthetic child provider. We don't want to call
- // "v.GetNumChildren()" on all objects as class, struct and union types don't
- // need to be completed if they are never expanded. So we want to avoid
- // calling this to only cases where we it makes sense to keep performance high
- // during normal debugging.
-
- // If we have an array type, say that it is indexed and provide the number of
- // children in case we have a huge array. If we don't do this, then we might
- // take a while to produce all children at onces which can delay your debug
- // session.
- const bool is_array = type_obj.IsArrayType();
+ // request can be broken up into grabbing only a few children at a time. We
+ // want to be careful and only call "v.GetNumChildren()" if we have an array
+ // type or if we have a synthetic child provider. We don't want to call
+ // "v.GetNumChildren()" on all objects as class, struct and union types
+ // don't need to be completed if they are never expanded. So we want to
+ // restrict this call to only the cases where it makes sense, to keep
+ // performance high during normal debugging.
+
+ // If we have an array type, say that it is indexed and provide the number
+ // of children in case we have a huge array. If we don't do this, then we
+ // might take a while to produce all children at once, which can delay your
+ // debug session.
+ const bool is_array = desc.type_obj.IsArrayType();
const bool is_synthetic = v.IsSynthetic();
if (is_array || is_synthetic) {
const auto num_children = v.GetNumChildren();
// We create a "[raw]" fake child for each synthetic type, so we have to
- // account for it when returning indexed variables. We don't need to do this
- // for non-indexed ones.
+ // account for it when returning indexed variables. We don't need to do
+ // this for non-indexed ones.
bool has_raw_child = is_synthetic && g_dap.enable_synthetic_child_debugging;
int actual_num_children = num_children + (has_raw_child ? 1 : 0);
if (is_array) {
object.try_emplace("indexedVariables", actual_num_children);
} else if (num_children > 0) {
- // If a type has a synthetic child provider, then the SBType of "v" won't
- // tell us anything about what might be displayed. So we can check if the
- // first child's name is "[0]" and then we can say it is indexed.
+ // If a type has a synthetic child provider, then the SBType of "v"
+ // won't tell us anything about what might be displayed. So we can check
+ // if the first child's name is "[0]" and then we can say it is indexed.
const char *first_child_name = v.GetChildAtIndex(0).GetName();
if (first_child_name && strcmp(first_child_name, "[0]") == 0)
object.try_emplace("indexedVariables", actual_num_children);
}
}
- EmplaceSafeString(object, "type", type_cstr ? type_cstr : NO_TYPENAME);
+ EmplaceSafeString(object, "type", desc.display_type_name);
if (varID != INT64_MAX)
object.try_emplace("id", varID);
if (v.MightHaveChildren())
object.try_emplace("variablesReference", variablesReference);
else
object.try_emplace("variablesReference", (int64_t)0);
- lldb::SBStream evaluateStream;
- v.GetExpressionPath(evaluateStream);
- const char *evaluateName = evaluateStream.GetData();
- if (evaluateName && evaluateName[0])
- EmplaceSafeString(object, "evaluateName", std::string(evaluateName));
-
- if (lldb::SBDeclaration decl = v.GetDeclaration(); decl.IsValid()) {
- llvm::json::Object decl_obj;
- if (lldb::SBFileSpec file = decl.GetFileSpec(); file.IsValid()) {
- char path[PATH_MAX] = "";
- if (file.GetPath(path, sizeof(path)) &&
- lldb::SBFileSpec::ResolvePath(path, path, PATH_MAX)) {
- decl_obj.try_emplace("path", std::string(path));
- }
- }
- if (int line = decl.GetLine())
- decl_obj.try_emplace("line", line);
- if (int column = decl.GetColumn())
- decl_obj.try_emplace("column", column);
-
- object.try_emplace("declaration", std::move(decl_obj));
- }
+ object.try_emplace("$__lldb_extensions", desc.GetVariableExtensionsJSON());
return llvm::json::Value(std::move(object));
}
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index 02ee499445fa..7f9fc7037c63 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -167,33 +167,6 @@ std::vector<std::string> GetStrings(const llvm::json::Object *obj,
void FillResponse(const llvm::json::Object &request,
llvm::json::Object &response);
-/// Utility function to convert SBValue \v into a string.
-std::string ValueToString(lldb::SBValue v);
-
-/// Emplace the string value from an SBValue into the supplied object
-/// using \a key as the key that will contain the value.
-///
-/// The value is what we will display in VS Code. Some SBValue objects
-/// can have a value and/or a summary. If a value has both, we
-/// combine the value and the summary into one string. If we only have a
-/// value or summary, then that is considered the value. If there is
-/// no value and no summary then the value is the type name followed by
-/// the address of the type if it has an address.
-///
-///
-/// \param[in] v
-/// A lldb::SBValue object to extract the string value from
-///
-///
-/// \param[in] object
-/// The object to place the value object into
-///
-///
-/// \param[in] key
-/// The key name to use when inserting the value object we create
-void SetValueForKey(lldb::SBValue &v, llvm::json::Object &object,
- llvm::StringRef key);
-
/// Converts \a bp to a JSON value and appends the first valid location to the
/// \a breakpoints array.
///
@@ -401,6 +374,39 @@ const char *GetNonNullVariableName(lldb::SBValue value);
std::string CreateUniqueVariableNameForDisplay(lldb::SBValue v,
bool is_name_duplicated);
+/// Helper struct that parses the metadata of an \a lldb::SBValue and produces
+/// a canonical set of properties that can be sent to DAP clients.
+struct VariableDescription {
+ // The error message if SBValue.GetValue() fails.
+ std::optional<std::string> error;
+ // The display description to show on the IDE.
+ std::string display_value;
+ // The display name to show on the IDE.
+ std::string name;
+ // The variable path for this variable.
+ std::string evaluate_name;
+ // The output of SBValue.GetValue() if it doesn't fail. It might be empty.
+ std::string value;
+ // The summary string of this variable. It might be empty.
+ std::string summary;
+ // The auto summary if using `enableAutoVariableSummaries`.
+ std::optional<std::string> auto_summary;
+ // The type of this variable.
+ lldb::SBType type_obj;
+ // The display type name of this variable.
+ std::string display_type_name;
+ /// The SBValue for this variable.
+ lldb::SBValue v;
+
+ VariableDescription(lldb::SBValue v, bool format_hex = false,
+ bool is_name_duplicated = false,
+ std::optional<std::string> custom_name = {});
+
+ /// Create a JSON object that represents these extensions to the DAP variable
+ /// response.
+ llvm::json::Object GetVariableExtensionsJSON();
+};
+
/// Create a "Variable" object for a LLDB thread object.
///
/// This function will fill in the following keys in the returned
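A short usage sketch of the struct above, matching how the lldb-dap.cpp changes later in this patch consume it:

VariableDescription desc(value);                        // `value` is an lldb::SBValue
EmplaceSafeString(body, "result", desc.display_value);  // human-readable display string
EmplaceSafeString(body, "type", desc.display_type_name);
body.try_emplace("$__lldb_extensions", desc.GetVariableExtensionsJSON());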
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 75b3948b5efb..0d45b0379c85 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -1316,10 +1316,9 @@ void request_evaluate(const llvm::json::Object &request) {
else
EmplaceSafeString(response, "message", "evaluate failed");
} else {
- SetValueForKey(value, body, "result");
- auto value_typename = value.GetType().GetDisplayTypeName();
- EmplaceSafeString(body, "type",
- value_typename ? value_typename : NO_TYPENAME);
+ VariableDescription desc(value);
+ EmplaceSafeString(body, "result", desc.display_value);
+ EmplaceSafeString(body, "type", desc.display_type_name);
if (value.MightHaveChildren()) {
auto variableReference = g_dap.variables.InsertExpandableVariable(
value, /*is_permanent=*/context == "repl");
@@ -3109,8 +3108,9 @@ void request_setVariable(const llvm::json::Object &request) {
lldb::SBError error;
bool success = variable.SetValueFromCString(value.data(), error);
if (success) {
- SetValueForKey(variable, body, "value");
- EmplaceSafeString(body, "type", variable.GetType().GetDisplayTypeName());
+ VariableDescription desc(variable);
+      EmplaceSafeString(body, "value", desc.display_value);
+ EmplaceSafeString(body, "type", desc.display_type_name);
// We don't know the index of the variable in our g_dap.variables
// so always insert a new one to get its variablesReference.
diff --git a/lldb/unittests/Thread/CMakeLists.txt b/lldb/unittests/Thread/CMakeLists.txt
index d6e365adac5d..8fc44da6f5b3 100644
--- a/lldb/unittests/Thread/CMakeLists.txt
+++ b/lldb/unittests/Thread/CMakeLists.txt
@@ -1,3 +1,8 @@
+if (CMAKE_SYSTEM_NAME MATCHES "Windows")
+ list(APPEND LLDB_WINDOWS_LIBS lldbPluginPlatformWindows)
+ list(APPEND LLDB_WINDOWS_LIBS lldbPluginProcessWindowsCommon)
+endif()
+
add_lldb_unittest(ThreadTests
ThreadTest.cpp
@@ -11,5 +16,6 @@ add_lldb_unittest(ThreadTests
lldbInterpreter
lldbBreakpoint
lldbPluginPlatformLinux
+ ${LLDB_WINDOWS_LIBS}
)
diff --git a/lldb/unittests/Thread/ThreadTest.cpp b/lldb/unittests/Thread/ThreadTest.cpp
index bd8cdce99f17..4c660e9815c3 100644
--- a/lldb/unittests/Thread/ThreadTest.cpp
+++ b/lldb/unittests/Thread/ThreadTest.cpp
@@ -8,9 +8,20 @@
#include "lldb/Target/Thread.h"
#include "Plugins/Platform/Linux/PlatformLinux.h"
+#include <thread>
+#ifdef _WIN32
+#include "lldb/Host/windows/HostThreadWindows.h"
+#include "lldb/Host/windows/windows.h"
+
+#include "Plugins/Platform/Windows/PlatformWindows.h"
+#include "Plugins/Process/Windows/Common/LocalDebugDelegate.h"
+#include "Plugins/Process/Windows/Common/ProcessWindows.h"
+#include "Plugins/Process/Windows/Common/TargetThreadWindows.h"
+#endif
#include "lldb/Core/Debugger.h"
#include "lldb/Host/FileSystem.h"
#include "lldb/Host/HostInfo.h"
+#include "lldb/Host/HostThread.h"
#include "lldb/Target/Process.h"
#include "lldb/Target/StopInfo.h"
#include "lldb/Utility/ArchSpec.h"
@@ -21,14 +32,33 @@ using namespace lldb_private::repro;
using namespace lldb;
namespace {
+
+#ifdef _WIN32
+using SetThreadDescriptionFunctionPtr = HRESULT
+WINAPI (*)(HANDLE hThread, PCWSTR lpThreadDescription);
+
+static SetThreadDescriptionFunctionPtr SetThreadName;
+#endif
+
class ThreadTest : public ::testing::Test {
public:
void SetUp() override {
FileSystem::Initialize();
HostInfo::Initialize();
+#ifdef _WIN32
+ HMODULE hModule = ::LoadLibraryW(L"Kernel32.dll");
+ if (hModule) {
+ SetThreadName = reinterpret_cast<SetThreadDescriptionFunctionPtr>(
+ ::GetProcAddress(hModule, "SetThreadDescription"));
+ }
+ PlatformWindows::Initialize();
+#endif
platform_linux::PlatformLinux::Initialize();
}
void TearDown() override {
+#ifdef _WIN32
+ PlatformWindows::Terminate();
+#endif
platform_linux::PlatformLinux::Terminate();
HostInfo::Terminate();
FileSystem::Terminate();
@@ -88,6 +118,47 @@ TargetSP CreateTarget(DebuggerSP &debugger_sp, ArchSpec &arch) {
return target_sp;
}
+#ifdef _WIN32
+std::shared_ptr<TargetThreadWindows>
+CreateWindowsThread(const ProcessWindowsSP &process_sp, std::thread &t) {
+ HostThread host_thread((lldb::thread_t)t.native_handle());
+ ThreadSP thread_sp =
+ std::make_shared<TargetThreadWindows>(*process_sp.get(), host_thread);
+ return std::static_pointer_cast<TargetThreadWindows>(thread_sp);
+}
+
+TEST_F(ThreadTest, GetThreadDescription) {
+ if (!SetThreadName)
+ return;
+
+ ArchSpec arch(HostInfo::GetArchitecture());
+ Platform::SetHostPlatform(PlatformWindows::CreateInstance(true, &arch));
+
+ DebuggerSP debugger_sp = Debugger::CreateInstance();
+ ASSERT_TRUE(debugger_sp);
+
+ TargetSP target_sp = CreateTarget(debugger_sp, arch);
+ ASSERT_TRUE(target_sp);
+
+ ListenerSP listener_sp(Listener::MakeListener("dummy"));
+ auto process_sp = std::static_pointer_cast<ProcessWindows>(
+ ProcessWindows::CreateInstance(target_sp, listener_sp, nullptr, false));
+ ASSERT_TRUE(process_sp);
+
+ std::thread t([]() {});
+ auto thread_sp = CreateWindowsThread(process_sp, t);
+ DWORD tid = thread_sp->GetHostThread().GetNativeThread().GetThreadId();
+ HANDLE hThread = ::OpenThread(THREAD_SET_LIMITED_INFORMATION, FALSE, tid);
+ ASSERT_TRUE(hThread);
+
+ SetThreadName(hThread, L"thread name");
+ ::CloseHandle(hThread);
+ ASSERT_STREQ(thread_sp->GetName(), "thread name");
+
+ t.join();
+}
+#endif
+
TEST_F(ThreadTest, SetStopInfo) {
ArchSpec arch("powerpc64-pc-linux");
diff --git a/lldb/unittests/Utility/ChecksumTest.cpp b/lldb/unittests/Utility/ChecksumTest.cpp
index 7537d30b5ff5..a81aba2ee98c 100644
--- a/lldb/unittests/Utility/ChecksumTest.cpp
+++ b/lldb/unittests/Utility/ChecksumTest.cpp
@@ -12,14 +12,14 @@
using namespace lldb_private;
-static llvm::MD5::MD5Result hash1 = {0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15};
+static llvm::MD5::MD5Result hash1 = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}};
-static llvm::MD5::MD5Result hash2 = {0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15};
+static llvm::MD5::MD5Result hash2 = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}};
-static llvm::MD5::MD5Result hash3 = {8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7};
+static llvm::MD5::MD5Result hash3 = {
+ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}};
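The extra brace level is needed because llvm::MD5::MD5Result wraps a std::array (as the added braces suggest) rather than being a plain C array. A stand-alone analogue of the initialization rule, with a made-up Wrapper type:

  #include <array>
  #include <cstdint>

  struct Wrapper {                  // stands in for llvm::MD5::MD5Result
    std::array<uint8_t, 4> Bytes;   // the wrapped aggregate member
  };

  // Outer braces initialize Wrapper, inner braces its std::array member.
  static Wrapper W = {{0, 1, 2, 3}};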
TEST(ChecksumTest, TestConstructor) {
Checksum checksum1;
diff --git a/lldb/unittests/Utility/FileSpecTest.cpp b/lldb/unittests/Utility/FileSpecTest.cpp
index 565395a495be..9faad10e4730 100644
--- a/lldb/unittests/Utility/FileSpecTest.cpp
+++ b/lldb/unittests/Utility/FileSpecTest.cpp
@@ -536,7 +536,8 @@ TEST(FileSpecTest, TestGetComponents) {
}
TEST(FileSpecTest, TestChecksum) {
- Checksum checksum({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+ Checksum checksum(llvm::MD5::MD5Result{
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}});
FileSpec file_spec("/foo/bar", FileSpec::Style::posix, checksum);
EXPECT_TRUE(static_cast<bool>(file_spec.GetChecksum()));
EXPECT_EQ(file_spec.GetChecksum(), checksum);
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 7bb3e98333ef..3e12213f6e6b 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -431,15 +431,13 @@ set(USE_NO_UNINITIALIZED 0)
# Disable gcc's potentially uninitialized use analysis as it presents lots of
# false positives.
if (CMAKE_COMPILER_IS_GNUCXX)
- check_cxx_compiler_flag("-Wmaybe-uninitialized" HAS_MAYBE_UNINITIALIZED)
- if (HAS_MAYBE_UNINITIALIZED)
- set(USE_NO_MAYBE_UNINITIALIZED 1)
- else()
- # Only recent versions of gcc make the distinction between -Wuninitialized
- # and -Wmaybe-uninitialized. If -Wmaybe-uninitialized isn't supported, just
- # turn off all uninitialized use warnings.
+  # Disable all -Wuninitialized warnings for old GCC versions.
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0)
check_cxx_compiler_flag("-Wuninitialized" HAS_UNINITIALIZED)
set(USE_NO_UNINITIALIZED ${HAS_UNINITIALIZED})
+ else()
+ check_cxx_compiler_flag("-Wmaybe-uninitialized" HAS_MAYBE_UNINITIALIZED)
+ set(USE_NO_MAYBE_UNINITIALIZED ${HAS_MAYBE_UNINITIALIZED})
endif()
endif()
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index f0c81bf878f7..371e583c22a8 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4406,7 +4406,15 @@ The fields used by CP for code objects before V3 also match those specified in
``COMPUTE_PGM_RSRC3``
configuration
register. See
- :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table`.
+ :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`.
+ GFX12
+ Compute Shader (CS)
+ program settings used by
+ CP to set up
+ ``COMPUTE_PGM_RSRC3``
+ configuration
+ register. See
+ :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx12-table`.
415:384 4 bytes COMPUTE_PGM_RSRC1 Compute Shader (CS)
program settings used by
CP to set up
@@ -4831,13 +4839,16 @@ The fields used by CP for code objects before V3 also match those specified in
Used by CP to set up
``COMPUTE_PGM_RSRC2.USER_SGPR``.
- 6 1 bit ENABLE_TRAP_HANDLER Must be 0.
+ 6 1 bit ENABLE_TRAP_HANDLER GFX6-GFX11
+ Must be 0.
- This bit represents
- ``COMPUTE_PGM_RSRC2.TRAP_PRESENT``,
- which is set by the CP if
- the runtime has installed a
- trap handler.
+ This bit represents
+ ``COMPUTE_PGM_RSRC2.TRAP_PRESENT``,
+ which is set by the CP if
+ the runtime has installed a
+ trap handler.
+ GFX12
+ Reserved, must be 0.
7 1 bit ENABLE_SGPR_WORKGROUP_ID_X Enable the setup of the
system SGPR register for
the work-group id in the X
@@ -4957,7 +4968,7 @@ The fields used by CP for code objects before V3 also match those specified in
30 1 bit ENABLE_EXCEPTION_INT_DIVIDE_BY Integer Division by Zero
_ZERO (rcp_iflag_f32 instruction
only)
- 31 1 bit Reserved, must be 0.
+ 31 1 bit RESERVED Reserved, must be 0.
32 **Total size 4 bytes.**
======= ===================================================================================================================
@@ -4986,8 +4997,8 @@ The fields used by CP for code objects before V3 also match those specified in
..
- .. table:: compute_pgm_rsrc3 for GFX10-GFX12
- :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table
+ .. table:: compute_pgm_rsrc3 for GFX10-GFX11
+ :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table
======= ======= =============================== ===========================================================================
Bits Size Field Name Description
@@ -5038,6 +5049,32 @@ The fields used by CP for code objects before V3 also match those specified in
..
+ .. table:: compute_pgm_rsrc3 for GFX12
+ :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx12-table
+
+ ======= ======= =============================== ===========================================================================
+ Bits Size Field Name Description
+ ======= ======= =============================== ===========================================================================
+ 3:0 4 bits RESERVED Reserved, must be 0.
+ 11:4 8 bits INST_PREF_SIZE Number of instruction bytes to prefetch, starting at the kernel's entry
+ point instruction, before wavefront starts execution. The value is 0..255
+ with a granularity of 128 bytes.
+ 12 1 bit RESERVED Reserved, must be 0.
+ 13 1 bit GLG_EN If 1, group launch guarantee will be enabled for this dispatch
+ 30:14 17 bits RESERVED Reserved, must be 0.
+ 31 1 bit IMAGE_OP If 1, the kernel execution contains image instructions. If executed as
+ part of a graphics pipeline, image read instructions will stall waiting
+ for any necessary ``WAIT_SYNC`` fence to be performed in order to
+ indicate that earlier pipeline stages have completed writing to the
+ image.
+
+ Not used for compute kernels that are not part of a graphics pipeline and
+ must be 0.
+ 32 **Total size 4 bytes.**
+ ======= ===================================================================================================================
+
+..
+
.. table:: Floating Point Rounding Mode Enumeration Values
:name: amdgpu-amdhsa-floating-point-rounding-mode-enumeration-values-table
@@ -15508,7 +15545,7 @@ terminated by an ``.end_amdhsa_kernel`` directive.
``.amdhsa_forward_progress`` 0 GFX10-GFX12 Controls FWD_PROGRESS in
:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`.
``.amdhsa_shared_vgpr_count`` 0 GFX10-GFX11 Controls SHARED_VGPR_COUNT in
- :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx12-table`.
+ :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`.
``.amdhsa_exception_fp_ieee_invalid_op`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION in
:ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`.
``.amdhsa_exception_fp_denorm_src`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_FP_DENORMAL_SOURCE in
diff --git a/llvm/docs/CommandGuide/llvm-cxxfilt.rst b/llvm/docs/CommandGuide/llvm-cxxfilt.rst
index 3f7deb171951..0933f0b5bed8 100644
--- a/llvm/docs/CommandGuide/llvm-cxxfilt.rst
+++ b/llvm/docs/CommandGuide/llvm-cxxfilt.rst
@@ -48,6 +48,10 @@ OPTIONS
Print a summary of command line options.
+.. option:: --no-params, -p
+
+ Do not demangle function parameters or return types.
+
.. option:: --no-strip-underscore, -n
Do not strip a leading underscore. This is the default for all platforms
diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
index bc37d52d1476..2ee533c324d9 100644
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -44,7 +44,7 @@ SNIPPET ANNOTATIONS
:program:`llvm-exegesis` supports benchmarking arbitrary snippets of assembly.
However, benchmarking these snippets often requires some setup so that they
-can execute properly. :program:`llvm-exegesis` has four annotations and some
+can execute properly. :program:`llvm-exegesis` has five annotations and some
additional utilities to help with setup so that snippets can be benchmarked
properly.
@@ -183,7 +183,7 @@ annotations added to the snippet:
.. code-block:: none
- # LLVM-EXEGESIS-MEM-DEF test1 4096 2147483647
+ # LLVM-EXEGESIS-MEM-DEF test1 4096 7fffffff
# LLVM-EXEGESIS-MEM-MAP test1 8192
movq $8192, %rax
diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst
index 15397c593d49..51ab6dfd8d8d 100644
--- a/llvm/docs/HowToReleaseLLVM.rst
+++ b/llvm/docs/HowToReleaseLLVM.rst
@@ -40,16 +40,16 @@ Release Approx. Date
=============================== =========================
*release branch: even releases* *4th Tue in January*
*release branch: odd releases* *4th Tue in July*
-X.0.0-rc1 3 days after branch.
-X.0.0-rc2 2 weeks after branch.
-X.0.0-rc3 4 weeks after branch
-**X.0.0-final** **6 weeks after branch**
-**X.0.1** **8 weeks after branch**
-**X.0.2** **10 weeks after branch**
-**X.0.3** **12 weeks after branch**
-**X.0.4** **14 weeks after branch**
-**X.0.5** **16 weeks after branch**
-**X.0.6 (if necessary)** **18 weeks after branch**
+X.1.0-rc1 3 days after branch.
+X.1.0-rc2 2 weeks after branch.
+X.1.0-rc3 4 weeks after branch
+**X.1.0-final** **6 weeks after branch**
+**X.1.1** **8 weeks after branch**
+**X.1.2** **10 weeks after branch**
+**X.1.3** **12 weeks after branch**
+**X.1.4** **14 weeks after branch**
+**X.1.5** **16 weeks after branch**
+**X.1.6 (if necessary)** **18 weeks after branch**
=============================== =========================
Release Process Summary
@@ -77,7 +77,7 @@ Release Process Summary
* Announce bug fix release schedule to the LLVM community and update the website.
-* Do bug-fix releases every two weeks until X.0.5 or X.0.6 (if necessary).
+* Do bug-fix releases every two weeks until X.1.5 or X.1.6 (if necessary).
Release Process
===============
@@ -123,6 +123,9 @@ Branch the Git trunk using the following procedure:
version bump. The branch's name is release/X.x where ``X`` is the major version
number and ``x`` is just the letter ``x``.
+#. On the newly-created release branch, immediately bump the version
+   to X.1.0git (where ``X`` is the major version of the branch).
+
#. All tags and branches need to be created in both the llvm/llvm-project and
llvm/llvm-test-suite repos.
@@ -406,13 +409,13 @@ Announce the Release
^^^^^^^^^^^^^^^^^^^^
Create a new post in the `Announce Category <https://discourse.llvm.org/c/announce>`_
-once all the release tasks are complete. For X.0.0 releases, make sure to include a
-link to the release notes in the post. For X.0.1+ releases, generate a changelog
+once all the release tasks are complete. For X.1.0 releases, make sure to include a
+link to the release notes in the post. For X.1.1+ releases, generate a changelog
using this command and add it to the post.
::
- $ git log --format="- %aN: [%s (%h)](https://github.com/llvm/llvm-project/commit/%H)" llvmorg-X.0.N-1..llvmorg-X.0.N
+ $ git log --format="- %aN: [%s (%h)](https://github.com/llvm/llvm-project/commit/%H)" llvmorg-X.1.N-1..llvmorg-X.1.N
Once the release has been announced add a link to the announcement on the llvm
homepage (from the llvm-www repo) in the "Release Emails" section.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 7f4a316a21ac..b5918e3063d8 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -10515,9 +10515,10 @@ Atomic loads produce :ref:`defined <memmodel>` results when they may see
multiple atomic stores. The type of the pointee must be an integer, pointer, or
floating-point type whose bit width is a power of two greater than or equal to
eight and less than or equal to a target-specific size limit. ``align`` must be
-explicitly specified on atomic loads, and the load has undefined behavior if the
-alignment is not set to a value which is at least the size in bytes of the
-pointee. ``!nontemporal`` does not have any defined semantics for atomic loads.
+explicitly specified on atomic loads. Note: if the alignment is not greater
+than or equal to the size of the `<value>` type, the atomic operation is
+likely to require a lock and have poor performance. ``!nontemporal`` does not
+have any defined semantics for atomic loads.
The optional constant ``align`` argument specifies the alignment of the
operation (that is, the alignment of the memory address). It is the
@@ -10655,9 +10656,10 @@ Atomic loads produce :ref:`defined <memmodel>` results when they may see
multiple atomic stores. The type of the pointee must be an integer, pointer, or
floating-point type whose bit width is a power of two greater than or equal to
eight and less than or equal to a target-specific size limit. ``align`` must be
-explicitly specified on atomic stores, and the store has undefined behavior if
-the alignment is not set to a value which is at least the size in bytes of the
-pointee. ``!nontemporal`` does not have any defined semantics for atomic stores.
+explicitly specified on atomic stores. Note: if the alignment is not greater
+than or equal to the size of the `<value>` type, the atomic operation is
+likely to require a lock and have poor performance. ``!nontemporal`` does not
+have any defined semantics for atomic stores.
The optional constant ``align`` argument specifies the alignment of the
operation (that is, the alignment of the memory address). It is the
@@ -10807,8 +10809,9 @@ must be at least ``monotonic``, the failure ordering cannot be either
A ``cmpxchg`` instruction can also take an optional
":ref:`syncscope <syncscope>`" argument.
-The alignment must be a power of two greater or equal to the size of the
-`<value>` type.
+Note: if the alignment is not greater than or equal to the size of the
+`<value>` type, the atomic operation is likely to require a lock and have poor
+performance.
The alignment is only optional when parsing textual IR; for in-memory IR, it is
always present. If unspecified, the alignment is assumed to be equal to the
@@ -10910,8 +10913,9 @@ the ``atomicrmw`` is marked as ``volatile``, then the optimizer is not
allowed to modify the number or order of execution of this
``atomicrmw`` with other :ref:`volatile operations <volatile>`.
-The alignment must be a power of two greater or equal to the size of the
-`<value>` type.
+Note: if the alignment is not greater than or equal to the size of the
+`<value>` type, the atomic operation is likely to require a lock and have poor
+performance.
The alignment is only optional when parsing textual IR; for in-memory IR, it is
always present. If unspecified, the alignment is assumed to be equal to the
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 84cc83ef847a..99c7146825f5 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -212,7 +212,7 @@ The primary goal of experimental support is to assist in the process of ratifica
``experimental-zfbfmin``, ``experimental-zvfbfmin``, ``experimental-zvfbfwma``
LLVM implements assembler support for the `0.8.0 draft specification <https://github.com/riscv/riscv-bfloat16/releases/tag/20230629>`_.
-``experimental-zicfilp``
+``experimental-zicfilp``, ``experimental-zicfiss``
LLVM implements the `0.4 draft specification <https://github.com/riscv/riscv-cfi/releases/tag/v0.4.0>`__.
``experimental-zicond``
@@ -221,6 +221,12 @@ The primary goal of experimental support is to assist in the process of ratifica
``experimental-ztso``
LLVM implements the `v0.1 proposed specification <https://github.com/riscv/riscv-isa-manual/releases/download/draft-20220723-10eea63/riscv-spec.pdf>`__ (see Chapter 25). The mapping from the C/C++ memory model to Ztso has not yet been ratified in any standards document. There are multiple possible mappings, and they are *not* mutually ABI compatible. The mapping LLVM implements is ABI compatible with the default WMO mapping. This mapping may change and there is *explicitly* no ABI stability offered while the extension remains in experimental status. User beware.
+``experimental-zimop``
+ LLVM implements the `v0.1 proposed specification <https://github.com/riscv/riscv-isa-manual/blob/main/src/zimop.adoc>`__.
+
+``experimental-zcmop``
+ LLVM implements the `v0.2 proposed specification <https://github.com/riscv/riscv-isa-manual/blob/main/src/zimop.adoc>`__.
+
To use an experimental extension from `clang`, you must add `-menable-experimental-extensions` to the command line, and specify the exact version of the experimental extension you are using. To use an experimental extension with LLVM's internal developer tools (e.g. `llc`, `llvm-objdump`, `llvm-mc`), you must prefix the extension name with `experimental-`. Note that you don't need to specify the version with internal tools, and shouldn't include the `experimental-` prefix with `clang`.
Vendor Extensions
@@ -294,9 +300,6 @@ The current vendor extensions supported are:
``XCVbi``
LLVM implements `version 1.0.0 of the CORE-V immediate branching custom instructions specification <https://github.com/openhwgroup/cv32e40p/blob/cv32e40p_v1.3.2/docs/source/instruction_set_extensions.rst>`_ by OpenHW Group. All instructions are prefixed with `cv.` as described in the specification. These instructions are only available for riscv32 at this time.
-``XSfcie``
- LLVM implements `version 1.0.0 of the SiFive Custom Instruction Extension (CIE) Software Specification <https://sifive.cdn.prismic.io/sifive/767804da-53b2-4893-97d5-b7c030ae0a94_s76mc_core_complex_manual_21G3.pdf>`_ by SiFive. All custom instruction are added as described in the specification, and the riscv-toolchain-convention document linked above. These instructions are only available for S76 processor at this time.
-
Experimental C Intrinsics
=========================
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 33afa09fcac3..52610e7de187 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -146,6 +146,11 @@ Changes to the RISC-V Backend
and is no longer experimental. However, the C intrinsics for these extensions
are still experimental. To use the C intrinsics for these extensions,
``-menable-experimental-extensions`` needs to be passed to Clang.
+* XSfcie extension and SiFive CSRs and instructions that were associated with
+ it have been removed. None of these CSRs and instructions were part of
+ "SiFive Custom Instruction Extension" as SiFive defines it. The LLVM project
+ needs to work with SiFive to define and document real extension names for
+ individual CSRs and instructions.
Changes to the WebAssembly Backend
----------------------------------
@@ -172,6 +177,11 @@ Changes to the X86 Backend
* Support ISA of ``AVX10.1-256`` and ``AVX10.1-512``.
* ``-mcpu=pantherlake`` and ``-mcpu=clearwaterforest`` are now supported.
* ``-mapxf`` is supported.
+* Marking global variables with ``code_model = "small"/"large"`` in the IR now
+ overrides the global code model to allow 32-bit relocations or require 64-bit
+ relocations to the global variable.
+* The medium code model's code generation was audited to be more similar to the
+ small code model where possible.
Changes to the OCaml bindings
-----------------------------
diff --git a/llvm/docs/llvm-objdump.1 b/llvm/docs/llvm-objdump.1
deleted file mode 100644
index 42dcc7367659..000000000000
--- a/llvm/docs/llvm-objdump.1
+++ /dev/null
@@ -1,209 +0,0 @@
-.\" Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-.\" See https://llvm.org/LICENSE.txt for license information.
-.\" SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-.\"
-.Dd December 19, 2018
-.Dt LLVM-OBJDUMP 1
-.Os
-.Sh NAME
-.Nm llvm-objdump
-.Nd LLVM object file dumper
-.Sh SYNOPSIS
-.Nm llvm-objdump
-.Op Ar options
-.Ar objfile ...
-.Sh DESCRIPTION
-.Nm
-prints the contents of object files and final linked images named on the
-command line.
-If no file name is specified,
-.Nm
-will attempt to read from
-.Pa a.out .
-If
-.Pa -
-is used as a file name,
-.Nm
-will process a file on its standard input stream.
-.Nm
-accepts many of the same command line arguments as GNU objdump.
-.Sh OPTIONS
-.Ss General Options
-.Bl -tag -width indent
-.It Fl -aarch64-neon-syntax Ns = Ns Ar value
-Choose style of NEON code to emit from AArch64 backend.
-.Ar value
-may be one of:
-.Bl -tag -width indent
-.It generic
-Generic NEON assembly
-.It apple
-Apple-style NEON assembly
-.El
-.It Fl -arch Ns = Ns Ar value
-Choose architecture(s) from a Mach-O file to dump
-.It Fl -arch-name Ns = Ns ar arch
-Target arch to disassemble for.
-See
-.Fl -version
-for available targets.
-.It Fl -bind
-Display mach-o binding info.
-.It Fl -color
-Use colored syntax highlighting.
-Default autodetect.
-.It Fl -disassemble
-Display assembler mnemonics for machine instructions.
-.It Fl -disassemble-all
-Display assembler mnemonics for the machine instruction in all sections.
-.It Fl -dsym Ns = Ns Ar file
-Use
-.Ar file
-for debug info.
-.It Fl -dwarf Ns = Ns Ar sections
-Dump of dwarf debug sections.
-.Bl -tag -width indent
-.It frames
-.Dv .debug_frame
-.El
-.It Fl -exports-trie
-Display mach-o exported symbols.
-.It Fl -fault-map-section
-Display contents of faultmap section.
-.It Fl -filter-print-funcs Ns = Ns Ar functions
-Only print IR for functions whose name match
-.Ar functions
-for all print-[before|after][-all] options.
-.It Fl -full-leading-addr
-Print full leading address.
-.It Fl g
-Print line information from debug info if available.
-.It Fl h , -headers , -section-headers
-Display summaries of the headers for each section.
-.It Fl -help
-Display available options.
-Use
-.Fl -help-hidden
-for more.
-.It Fl -lazy-bind
-Display mach-o lazy binding info.
-.It Fl -line-numbers
-Display source line numbers with disassembly.
-Implies disassemble object.
-.It Fl -macho
-Use MachO specific object file parser.
-.It Fl -mattr Ns = Ns Ar attribute ...
-Target specific attributes.
-.It Fl -mcpu Ns = Ns Ar CPU
-Target a specific cpu type.
-Use
-.Fl mcpu Ns = Ns help
-for details.
-.It Fl -no-leading-addr
-Print no leading address.
-.It Fl -no-leading-headers
-Print no leading headers.
-.It Fl -no-show-raw-insn
-When disassembling instructions, do not print the instruction bytes.
-.It Fl -offloading
-Display the content of the LLVM offloading section.
-.It Fl -prefix Ns = Ns Ar PREFIX
-When disassembling, add
-.Ar PREFIX
-to absolute paths.
-.It Fl -prefix-strip Ns = Ns Ar LEVEL
-When disassembling, strip out
-.Ar LEVEL
-initial directories from absolute paths. This option has no effect without
-.Fl -prefix Ns = Ns PREFIX .
-.It Fl -print-imm-hex
-Use hex format for immediate values.
-.It Fl -private-header
-Display only the first format specific file header.
-.It Fl -private-headers
-Display format specific file headers.
-.It Fl r
-Display the relocation entries in the file.
-.It Fl -raw-clang-ast
-Dump the raw binary contents of the clang AST section.
-.It Fl -rebase
-Display mach-o rebasing info.
-.It Fl -reverse-iterate
-Reverse iterate.
-.It Fl s
-Display the content of each section.
-.It Fl -section Ns = Ns Ar section
-Operate on the specified sections only.
-With
-.Fl -macho
-dump segment,section.
-.It Fl -source
-Display source inline with disassembly.
-Implies disassemble object.
-.It Fl -start-address Ns = Ns Ar address
-Disassemble beginning at
-.Ar address .
-.It Fl -stop-address Ns = Ns Ar address
-Stop disassembly at
-.Ar address .
-.It Fl t
-Display the symbol table.
-.It Fl -triple Ns = Ns Ar triple
-Target triple to disassemble for.
-See
-.Fl -version
-for available targets.
-.It Fl -unwind-info
-Display unwind information.
-.It Fl -version
-Display the version of this program.
-.It Fl -weak-bind
-Display mach-o weak binding info.
-.It Fl -x86-asm-syntax Ns = Ns Ar syntax
-Choose style of code to emit from X86 backend.
-.Bl -tag -width indent
-.It att
-Emit AT&T-style assembly.
-.It intel
-Emit Intel-style assembly.
-.El
-.El
-.Ss Mach-O Options
-There are a number of options specific to the Mach-O format.
-These are used in combination with the
-.Fl -macho
-option.
-.Bl -tag -width indent
-.It Fl -archive-headers
-Print archive headers for Mach-O archives.
-.It Fl -archive-member-offsets
-Print the offset to each archive member for Mach-O archives.
-Requires
-.Fl -macho
-and
-.Fl -archive-headers .
-.It Fl -data-in-code
-Print the data in code table for Mach-O objects.
-.It Fl -dis-symname Ns = Ns Ar symbol
-Disassemble just
-.Ar symbol 's
-instructions.
-.It Fl -dylib-id
-Print the shared library's id for the dylib Mach-O file.
-.It Fl -dylibs-used
-Print the shared libraries used for linked Mach-O files.
-.It Fl -indirect-symbols
-Print indirect symbol table for Mach-O objects.
-.It Fl -info-plist
-Print the info plist section as strings for Mach-O objects.
-.It Fl -link-opt-hints
-Print the linker optimization hints for Mach-O objects.
-.It Fl -no-symbolic-operands
-do not symbolic operands when disassembling.
-.It Fl -non-verbose
-Print the info for Mach-O objects in non-verbose or numeric form.
-.It Fl -objc-meta-data
-Print the Objective-C runtime meta data for Mach-O files.
-.It Fl -universal-headers
-Print Mach-O universal headers.
-.El
diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst
index 0347127d0cdf..8fd4c39d3ff4 100644
--- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst
+++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst
@@ -182,7 +182,7 @@ example through the pass, for example, you'll get:
.. code-block:: bash
- $ llvm-as < example.ll | opt -mem2reg | llvm-dis
+ $ llvm-as < example.ll | opt -passes=mem2reg | llvm-dis
@G = weak global i32 0
@H = weak global i32 0
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index b7d0a1228ebf..d397b937d78c 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -33,6 +33,12 @@
/// the propagation of the impact of divergent control flow on the divergence of
/// values (sync dependencies).
///
+/// NOTE: In general, no interface exists for a transform to update
+/// (Machine)UniformityInfo. Additionally, (Machine)CycleAnalysis is a
+/// transitive dependence, but it also does not provide an interface for
+/// updating itself. Given that, transforms should not preserve uniformity in
+/// their getAnalysisUsage() callback.
+///
//===----------------------------------------------------------------------===//
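A sketch of what the new note implies for a legacy-PM pass; MyTransformPass is hypothetical, and the point is only what is deliberately absent from getAnalysisUsage():

  // Require the analysis, but do not claim to preserve it (or CycleInfo),
  // since neither provides an update interface.
  void MyTransformPass::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<UniformityInfoWrapperPass>();
    // Intentionally no AU.addPreserved<UniformityInfoWrapperPass>() here.
    FunctionPass::getAnalysisUsage(AU);
  }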
#ifndef LLVM_ADT_GENERICUNIFORMITYIMPL_H
diff --git a/llvm/include/llvm/ADT/SmallString.h b/llvm/include/llvm/ADT/SmallString.h
index 02fa28fc856d..a5b9eec50c82 100644
--- a/llvm/include/llvm/ADT/SmallString.h
+++ b/llvm/include/llvm/ADT/SmallString.h
@@ -89,30 +89,26 @@ public:
/// Check for string equality. This is more efficient than compare() when
/// the relative ordering of inequal strings isn't needed.
- bool equals(StringRef RHS) const {
- return str().equals(RHS);
- }
+ [[nodiscard]] bool equals(StringRef RHS) const { return str().equals(RHS); }
/// Check for string equality, ignoring case.
- bool equals_insensitive(StringRef RHS) const {
+ [[nodiscard]] bool equals_insensitive(StringRef RHS) const {
return str().equals_insensitive(RHS);
}
/// compare - Compare two strings; the result is negative, zero, or positive
/// if this string is lexicographically less than, equal to, or greater than
/// the \p RHS.
- int compare(StringRef RHS) const {
- return str().compare(RHS);
- }
+ [[nodiscard]] int compare(StringRef RHS) const { return str().compare(RHS); }
/// compare_insensitive - Compare two strings, ignoring case.
- int compare_insensitive(StringRef RHS) const {
+ [[nodiscard]] int compare_insensitive(StringRef RHS) const {
return str().compare_insensitive(RHS);
}
/// compare_numeric - Compare two strings, treating sequences of digits as
/// numbers.
- int compare_numeric(StringRef RHS) const {
+ [[nodiscard]] int compare_numeric(StringRef RHS) const {
return str().compare_numeric(RHS);
}
@@ -121,10 +117,14 @@ public:
/// @{
/// starts_with - Check if this string starts with the given \p Prefix.
- bool starts_with(StringRef Prefix) const { return str().starts_with(Prefix); }
+ [[nodiscard]] bool starts_with(StringRef Prefix) const {
+ return str().starts_with(Prefix);
+ }
/// ends_with - Check if this string ends with the given \p Suffix.
- bool ends_with(StringRef Suffix) const { return str().ends_with(Suffix); }
+ [[nodiscard]] bool ends_with(StringRef Suffix) const {
+ return str().ends_with(Suffix);
+ }
/// @}
/// @name String Searching
@@ -134,7 +134,7 @@ public:
///
/// \return - The index of the first occurrence of \p C, or npos if not
/// found.
- size_t find(char C, size_t From = 0) const {
+ [[nodiscard]] size_t find(char C, size_t From = 0) const {
return str().find(C, From);
}
@@ -142,7 +142,7 @@ public:
///
/// \returns The index of the first occurrence of \p Str, or npos if not
/// found.
- size_t find(StringRef Str, size_t From = 0) const {
+ [[nodiscard]] size_t find(StringRef Str, size_t From = 0) const {
return str().find(Str, From);
}
@@ -150,7 +150,7 @@ public:
///
/// \returns The index of the last occurrence of \p C, or npos if not
/// found.
- size_t rfind(char C, size_t From = StringRef::npos) const {
+ [[nodiscard]] size_t rfind(char C, size_t From = StringRef::npos) const {
return str().rfind(C, From);
}
@@ -158,13 +158,11 @@ public:
///
/// \returns The index of the last occurrence of \p Str, or npos if not
/// found.
- size_t rfind(StringRef Str) const {
- return str().rfind(Str);
- }
+ [[nodiscard]] size_t rfind(StringRef Str) const { return str().rfind(Str); }
/// Find the first character in the string that is \p C, or npos if not
/// found. Same as find.
- size_t find_first_of(char C, size_t From = 0) const {
+ [[nodiscard]] size_t find_first_of(char C, size_t From = 0) const {
return str().find_first_of(C, From);
}
@@ -172,13 +170,13 @@ public:
/// not found.
///
/// Complexity: O(size() + Chars.size())
- size_t find_first_of(StringRef Chars, size_t From = 0) const {
+ [[nodiscard]] size_t find_first_of(StringRef Chars, size_t From = 0) const {
return str().find_first_of(Chars, From);
}
/// Find the first character in the string that is not \p C or npos if not
/// found.
- size_t find_first_not_of(char C, size_t From = 0) const {
+ [[nodiscard]] size_t find_first_not_of(char C, size_t From = 0) const {
return str().find_first_not_of(C, From);
}
@@ -186,13 +184,15 @@ public:
/// \p Chars, or npos if not found.
///
/// Complexity: O(size() + Chars.size())
- size_t find_first_not_of(StringRef Chars, size_t From = 0) const {
+ [[nodiscard]] size_t find_first_not_of(StringRef Chars,
+ size_t From = 0) const {
return str().find_first_not_of(Chars, From);
}
/// Find the last character in the string that is \p C, or npos if not
/// found.
- size_t find_last_of(char C, size_t From = StringRef::npos) const {
+ [[nodiscard]] size_t find_last_of(char C,
+ size_t From = StringRef::npos) const {
return str().find_last_of(C, From);
}
@@ -200,8 +200,8 @@ public:
/// found.
///
/// Complexity: O(size() + Chars.size())
- size_t find_last_of(
- StringRef Chars, size_t From = StringRef::npos) const {
+ [[nodiscard]] size_t find_last_of(StringRef Chars,
+ size_t From = StringRef::npos) const {
return str().find_last_of(Chars, From);
}
@@ -210,15 +210,11 @@ public:
/// @{
/// Return the number of occurrences of \p C in the string.
- size_t count(char C) const {
- return str().count(C);
- }
+ [[nodiscard]] size_t count(char C) const { return str().count(C); }
/// Return the number of non-overlapped occurrences of \p Str in the
/// string.
- size_t count(StringRef Str) const {
- return str().count(Str);
- }
+ [[nodiscard]] size_t count(StringRef Str) const { return str().count(Str); }
/// @}
/// @name Substring Operations
@@ -233,7 +229,8 @@ public:
/// \param N The number of characters to included in the substring. If \p N
/// exceeds the number of characters remaining in the string, the string
/// suffix (starting with \p Start) will be returned.
- StringRef substr(size_t Start, size_t N = StringRef::npos) const {
+ [[nodiscard]] StringRef substr(size_t Start,
+ size_t N = StringRef::npos) const {
return str().substr(Start, N);
}
@@ -247,14 +244,16 @@ public:
/// substring. If this is npos, or less than \p Start, or exceeds the
/// number of characters remaining in the string, the string suffix
/// (starting with \p Start) will be returned.
- StringRef slice(size_t Start, size_t End) const {
+ [[nodiscard]] StringRef slice(size_t Start, size_t End) const {
return str().slice(Start, End);
}
// Extra methods.
/// Explicit conversion to StringRef.
- StringRef str() const { return StringRef(this->data(), this->size()); }
+ [[nodiscard]] StringRef str() const {
+ return StringRef(this->data(), this->size());
+ }
// TODO: Make this const, if it's safe...
const char* c_str() {
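The hunks above mechanically add [[nodiscard]] to SmallString's query methods. A stand-alone illustration of what the attribute buys (made-up names, not LLVM code):

  [[nodiscard]] bool equalsDemo(int A, int B) { return A == B; }

  void caller() {
    equalsDemo(1, 2);              // compiler warning: result ignored
    bool Same = equalsDemo(1, 2);  // fine: result is consumed
    (void)Same;
  }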
diff --git a/llvm/include/llvm/ADT/Twine.h b/llvm/include/llvm/ADT/Twine.h
index 8dfbe4f72e07..1f1fd1967efb 100644
--- a/llvm/include/llvm/ADT/Twine.h
+++ b/llvm/include/llvm/ADT/Twine.h
@@ -37,7 +37,7 @@ namespace llvm {
/// A Twine is not intended for use directly and should not be stored, its
/// implementation relies on the ability to store pointers to temporary stack
/// objects which may be deallocated at the end of a statement. Twines should
- /// only be used accepted as const references in arguments, when an API wishes
+ /// only be used as const references in arguments, when an API wishes
/// to accept possibly-concatenated strings.
///
/// Twines support a special 'null' value, which always concatenates to form
diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h
index 5d3bc64bf8b4..7b02b618f7cb 100644
--- a/llvm/include/llvm/Analysis/ConstraintSystem.h
+++ b/llvm/include/llvm/Analysis/ConstraintSystem.h
@@ -54,9 +54,6 @@ class ConstraintSystem {
/// constraint system.
DenseMap<Value *, unsigned> Value2Index;
- /// Current greatest common divisor for all coefficients in the system.
- uint32_t GCD = 1;
-
// Eliminate constraints from the system using Fourier–Motzkin elimination.
bool eliminateUsingFM();
@@ -88,10 +85,6 @@ public:
for (const auto &[Idx, C] : enumerate(R)) {
if (C == 0)
continue;
- auto A = std::abs(C);
- GCD = APIntOps::GreatestCommonDivisor({32, (uint32_t)A}, {32, GCD})
- .getZExtValue();
-
NewRow.emplace_back(C, Idx);
}
if (Constraints.empty())
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 2ffd4d4b7143..daf1d8e2079f 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -156,6 +156,10 @@ public:
/// FDecl is assumed to have a parent Module when using this function.
bool getLibFunc(const Function &FDecl, LibFunc &F) const;
+ /// Searches for a function name using an Instruction \p Opcode.
+ /// Currently, only the frem instruction is supported.
+ bool getLibFunc(unsigned int Opcode, Type *Ty, LibFunc &F) const;
+
/// Forces a function to be marked as unavailable.
void setUnavailable(LibFunc F) {
setState(F, Unavailable);
@@ -360,6 +364,12 @@ public:
getLibFunc(*(CB.getCalledFunction()), F);
}
+ /// Searches for a function name using an Instruction \p Opcode.
+ /// Currently, only the frem instruction is supported.
+ bool getLibFunc(unsigned int Opcode, Type *Ty, LibFunc &F) const {
+ return Impl->getLibFunc(Opcode, Ty, F);
+ }
+
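A hedged sketch of how a caller might use the new opcode-based overload; `TLI` (a TargetLibraryInfo) and `Ty` (the operand type) are assumed context, and the fmodf mapping is what frem support presumably resolves to for float:

  // Hypothetical caller: map an frem instruction to its libcall, if any.
  LibFunc TheLibFunc;
  if (TLI.getLibFunc(Instruction::FRem, Ty, TheLibFunc) &&
      TLI.has(TheLibFunc)) {
    StringRef Name = TLI.getName(TheLibFunc); // e.g. "fmodf" for float
    (void)Name;
  }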
/// Disables all builtins.
///
/// This can be used for options like -fno-builtin.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f5114fa40c70..048912beaba5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -560,6 +560,10 @@ public:
// (set to UINT_MAX to disable). This does not apply in cases where the
// loop is being fully unrolled.
unsigned MaxCount;
+ /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound
+    /// to be overridden by a target gives more flexibility in certain cases.
+    /// By default, MaxUpperBound uses UnrollMaxUpperBound, whose value is 8.
+ unsigned MaxUpperBound;
/// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
/// applies even if full unrolling is selected. This allows a target to fall
/// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
@@ -1239,6 +1243,18 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr) const;
+ /// Returns the cost estimation for alternating opcode pattern that can be
+ /// lowered to a single instruction on the target. In X86 this is for the
+  /// addsub instruction, which corresponds to a Shuffle + FAdd + FSub pattern
+  /// in IR. This function expects two opcodes: \p Opcode0 and \p Opcode1,
+  /// selected by \p OpcodeMask. The mask contains one bit per lane and is `0`
+  /// when \p Opcode0 is selected and `1` when \p Opcode1 is selected.
+ /// \p VecTy is the vector type of the instruction to be generated.
+ InstructionCost getAltInstrCost(
+ VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
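A hedged usage sketch of the new hook; `TTI` (a TargetTransformInfo) and `Ctx` (an LLVMContext) are assumed context, and the mask layout simply follows the comment above (a set bit selects Opcode1):

  // Hypothetical query: cost of an fadd/fsub alternating pattern on <4 x float>.
  auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  SmallBitVector OpcodeMask(4, false);
  OpcodeMask.set(1);  // lane 1 -> Opcode1 (FSub)
  OpcodeMask.set(3);  // lane 3 -> Opcode1 (FSub); lanes 0 and 2 stay FAdd
  InstructionCost Cost = TTI.getAltInstrCost(
      VecTy, Instruction::FAdd, Instruction::FSub, OpcodeMask,
      TargetTransformInfo::TCK_RecipThroughput);
  if (!Cost.isValid()) {
    // No single-instruction lowering (e.g. addsub) for this pattern.
  }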
/// \return The cost of a shuffle instruction of kind Kind and of type Tp.
/// The exact mask may be passed as Mask, or else the array will be empty.
/// The index and subtype parameters are used by the subvector insertion and
@@ -1940,6 +1956,10 @@ public:
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0;
+ virtual InstructionCost getAltInstrCost(
+ VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const = 0;
virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
@@ -2551,6 +2571,12 @@ public:
return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
Args, CxtI);
}
+ InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const override {
+ return Impl.getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
+ }
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 1d8f523e9792..7ad3ce512a35 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -554,6 +554,13 @@ public:
return 1;
}
+ InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const {
+ return InstructionCost::getInvalid();
+ }
+
InstructionCost
getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index a3186e61b94a..baa16306ebf5 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -863,6 +863,11 @@ ConstantRange computeConstantRange(const Value *V, bool ForSigned,
const DominatorTree *DT = nullptr,
unsigned Depth = 0);
+/// Combine constant ranges from computeConstantRange() and computeKnownBits().
+ConstantRange
+computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V,
+ bool ForSigned, const SimplifyQuery &SQ);
+
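A brief usage sketch; `V` (a const Value *) and `DL` (the module's DataLayout) are assumed context:

  // Hypothetical caller combining both sources of range information.
  SimplifyQuery SQ(DL);
  ConstantRange CR =
      computeConstantRangeIncludingKnownBits(V, /*ForSigned=*/false, SQ);
  if (CR.isSingleElement()) {
    // V is provably a single constant value.
  }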
/// Return true if this function can prove that the instruction I will
/// always transfer execution to one of its successors (including the next
/// instruction that follows within a basic block). E.g. this is not
diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h
index af874aa5e91a..6eb09f32f9f9 100644
--- a/llvm/include/llvm/CodeGen/AccelTable.h
+++ b/llvm/include/llvm/CodeGen/AccelTable.h
@@ -270,16 +270,14 @@ public:
#endif
uint64_t getDieOffset() const {
- assert(std::holds_alternative<uint64_t>(OffsetVal) &&
- "Accessing DIE Offset before normalizing.");
+ assert(isNormalized() && "Accessing DIE Offset before normalizing.");
return std::get<uint64_t>(OffsetVal);
}
unsigned getDieTag() const { return DieTag; }
unsigned getUnitID() const { return UnitID; }
bool isTU() const { return IsTU; }
void normalizeDIEToOffset() {
- assert(std::holds_alternative<const DIE *>(OffsetVal) &&
- "Accessing offset after normalizing.");
+ assert(!isNormalized() && "Accessing offset after normalizing.");
OffsetVal = std::get<const DIE *>(OffsetVal)->getOffset();
}
bool isNormalized() const {
@@ -309,7 +307,7 @@ class DWARF5AccelTable : public AccelTable<DWARF5AccelTableData> {
public:
struct UnitIndexAndEncoding {
unsigned Index;
- DWARF5AccelTableData::AttributeEncoding Endoding;
+ DWARF5AccelTableData::AttributeEncoding Encoding;
};
/// Returns type units that were constructed.
const TUVectorTy &getTypeUnitsSymbols() { return TUSymbolsOrHashes; }
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index e7debc652a0a..dcc1a4580b14 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -769,9 +769,6 @@ public:
bool matchCombineFSubFpExtFNegFMulToFMadOrFMA(MachineInstr &MI,
BuildFnTy &MatchInfo);
- /// Fold boolean selects to logical operations.
- bool matchSelectToLogical(MachineInstr &MI, BuildFnTy &MatchInfo);
-
bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info);
/// Transform G_ADD(x, G_SUB(y, x)) to y.
@@ -814,6 +811,9 @@ public:
// Given a binop \p MI, commute operands 1 and 2.
void applyCommuteBinOpOperands(MachineInstr &MI);
+ /// Combine selects.
+ bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -904,6 +904,18 @@ private:
/// select (fcmp uge x, 1.0) 1.0, x -> fminnm x, 1.0
bool matchFPSelectToMinMax(Register Dst, Register Cond, Register TrueVal,
Register FalseVal, BuildFnTy &MatchInfo);
+
+ /// Try to fold selects to logical operations.
+ bool tryFoldBoolSelectToLogic(GSelect *Select, BuildFnTy &MatchInfo);
+
+ bool tryFoldSelectOfConstants(GSelect *Select, BuildFnTy &MatchInfo);
+
+ bool isOneOrOneSplat(Register Src, bool AllowUndefs);
+ bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
+ bool isConstantSplatVector(Register Src, int64_t SplatValue,
+ bool AllowUndefs);
+
+ std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
};
} // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/MacroFusion.h b/llvm/include/llvm/CodeGen/MacroFusion.h
index a359fca60426..191c906e9ef6 100644
--- a/llvm/include/llvm/CodeGen/MacroFusion.h
+++ b/llvm/include/llvm/CodeGen/MacroFusion.h
@@ -50,15 +50,11 @@ bool fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
/// for instructions that benefit according to the target-specific
/// predicate functions. shouldScheduleAdjacent will be true if any of the
/// provided predicates are true.
+/// If BranchOnly is true, only branch instructions are fused with one of
+/// their predecessors.
std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates);
-
-/// Create a DAG scheduling mutation to pair branch instructions with one
-/// of their predecessors back to back for instructions that benefit according
-/// to the target-specific predicate functions. shouldScheduleAdjacent will be
-/// true if any of the provided predicates are true.
-std::unique_ptr<ScheduleDAGMutation>
-createBranchMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates);
+createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates,
+ bool BranchOnly = false);
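A sketch of how a target might express the old branch-only behaviour through the merged entry point; the helper name and the `shouldScheduleAdjacent` predicate are assumptions:

  // Hypothetical target helper, equivalent to the removed
  // createBranchMacroFusionDAGMutation().
  std::unique_ptr<ScheduleDAGMutation> createMyBranchFusionDAGMutation() {
    return createMacroFusionDAGMutation({shouldScheduleAdjacent},
                                        /*BranchOnly=*/true);
  }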
} // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 490125164ab3..ed2b513be960 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2869,6 +2869,12 @@ public:
getApproximateEVTForLLT(ToTy, DL, Ctx));
}
+  /// Return true if truncating the specified node Val to type VT2 is free.
+ virtual bool isTruncateFree(SDValue Val, EVT VT2) const {
+ // Fallback to type matching.
+ return isTruncateFree(Val.getValueType(), VT2);
+ }
+
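A hedged example of a target overriding the new SDValue-based hook; MyTargetLowering and the AND heuristic are made up, while the fallback call is the documented default:

  bool MyTargetLowering::isTruncateFree(SDValue Val, EVT VT2) const {
    // Example heuristic: the producer already cleared the high bits.
    if (Val.getOpcode() == ISD::AND)
      return true;
    // Otherwise fall back to plain type-based matching.
    return TargetLowering::isTruncateFree(Val, VT2);
  }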
virtual bool isProfitableToHoist(Instruction *I) const { return true; }
/// Return true if the extension represented by \p I is free.
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index d464263c190a..fc1f9bf342f8 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -50,9 +50,15 @@
don't. */
#cmakedefine01 HAVE_DECL_STRERROR_S
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H}
+
/* Define if dlopen() is available on this platform. */
#cmakedefine HAVE_DLOPEN ${HAVE_DLOPEN}
+/* Define if dladdr() is available on this platform. */
+#cmakedefine HAVE_DLADDR ${HAVE_DLADDR}
+
/* Define to 1 if we can register EH frames on this platform. */
#cmakedefine HAVE_REGISTER_FRAME ${HAVE_REGISTER_FRAME}
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 483c5adc99ca..6605ea60df99 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -198,10 +198,4 @@
/* Define if plugins enabled */
#cmakedefine LLVM_ENABLE_PLUGINS
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H}
-
-/* Define if dladdr() is available on this platform. */
-#cmakedefine HAVE_DLADDR ${HAVE_DLADDR}
-
#endif
diff --git a/llvm/include/llvm/Demangle/Demangle.h b/llvm/include/llvm/Demangle/Demangle.h
index 70cfc1418f0c..fe129603c078 100644
--- a/llvm/include/llvm/Demangle/Demangle.h
+++ b/llvm/include/llvm/Demangle/Demangle.h
@@ -32,7 +32,7 @@ enum : int {
/// Returns a non-NULL pointer to a NUL-terminated C style string
/// that should be explicitly freed, if successful. Otherwise, may return
/// nullptr if mangled_name is not a valid mangling or is nullptr.
-char *itaniumDemangle(std::string_view mangled_name);
+char *itaniumDemangle(std::string_view mangled_name, bool ParseParams = true);
enum MSDemangleFlags {
MSDF_None = 0,
@@ -68,7 +68,8 @@ char *dlangDemangle(std::string_view MangledName);
std::string demangle(std::string_view MangledName);
bool nonMicrosoftDemangle(std::string_view MangledName, std::string &Result,
- bool CanHaveLeadingDot = true);
+ bool CanHaveLeadingDot = true,
+ bool ParseParams = true);
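A small, self-contained sketch of the new flag; the example mangled name and its expected output are assumptions about how the name-only mode behaves:

  #include "llvm/Demangle/Demangle.h"
  #include <cstdio>
  #include <cstdlib>

  void printBareName(const char *Mangled) {
    // ParseParams=false: demangle the name, skip parameters and return type.
    if (char *Demangled =
            llvm::itaniumDemangle(Mangled, /*ParseParams=*/false)) {
      std::puts(Demangled);  // e.g. "_Z3fooif" would presumably print "foo"
      std::free(Demangled);  // the returned buffer is heap-allocated
    }
  }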
/// "Partial" demangler. This supports demangling a string into an AST
/// (typically an intermediate stage in itaniumDemangle) and querying certain
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index e0ff035d47cf..06956f47c1f0 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -2793,7 +2793,7 @@ template <typename Derived, typename Alloc> struct AbstractManglingParser {
Node *parseClassEnumType();
Node *parseQualifiedType();
- Node *parseEncoding();
+ Node *parseEncoding(bool ParseParams = true);
bool parseCallOffset();
Node *parseSpecialName();
@@ -2910,7 +2910,7 @@ template <typename Derived, typename Alloc> struct AbstractManglingParser {
Node *parseDestructorName();
/// Top-level entry point into the parser.
- Node *parse();
+ Node *parse(bool ParseParams = true);
};
const char* parse_discriminator(const char* first, const char* last);
@@ -5404,7 +5404,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parseSpecialName() {
// ::= <data name>
// ::= <special-name>
template <typename Derived, typename Alloc>
-Node *AbstractManglingParser<Derived, Alloc>::parseEncoding() {
+Node *AbstractManglingParser<Derived, Alloc>::parseEncoding(bool ParseParams) {
// The template parameters of an encoding are unrelated to those of the
// enclosing context.
SaveTemplateParams SaveTemplateParamsScope(this);
@@ -5430,6 +5430,16 @@ Node *AbstractManglingParser<Derived, Alloc>::parseEncoding() {
if (IsEndOfEncoding())
return Name;
+ // ParseParams may be false at the top level only, when called from parse().
+ // For example in the mangled name _Z3fooILZ3BarEET_f, ParseParams may be
+ // false when demangling 3fooILZ3BarEET_f but is always true when demangling
+ // 3Bar.
+ if (!ParseParams) {
+ while (consume())
+ ;
+ return Name;
+ }
+
Node *Attrs = nullptr;
if (consumeIf("Ua9enable_ifI")) {
size_t BeforeArgs = Names.size();
@@ -5894,9 +5904,9 @@ AbstractManglingParser<Derived, Alloc>::parseTemplateArgs(bool TagTemplates) {
// extension ::= ___Z <encoding> _block_invoke<decimal-digit>+
// extension ::= ___Z <encoding> _block_invoke_<decimal-digit>+
template <typename Derived, typename Alloc>
-Node *AbstractManglingParser<Derived, Alloc>::parse() {
+Node *AbstractManglingParser<Derived, Alloc>::parse(bool ParseParams) {
if (consumeIf("_Z") || consumeIf("__Z")) {
- Node *Encoding = getDerived().parseEncoding();
+ Node *Encoding = getDerived().parseEncoding(ParseParams);
if (Encoding == nullptr)
return nullptr;
if (look() == '.') {
@@ -5910,7 +5920,7 @@ Node *AbstractManglingParser<Derived, Alloc>::parse() {
}
if (consumeIf("___Z") || consumeIf("____Z")) {
- Node *Encoding = getDerived().parseEncoding();
+ Node *Encoding = getDerived().parseEncoding(ParseParams);
if (Encoding == nullptr || !consumeIf("_block_invoke"))
return nullptr;
bool RequireNumber = consumeIf('_');
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index abbef03d02cb..669104307fa0 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2562,6 +2562,13 @@ public:
AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
+ InsertPointTy createAtomicCompare(const LocationDescription &Loc,
+ AtomicOpValue &X, AtomicOpValue &V,
+ AtomicOpValue &R, Value *E, Value *D,
+ AtomicOrdering AO,
+ omp::OMPAtomicCompareOp Op,
+ bool IsXBinopExpr, bool IsPostfixUpdate,
+ bool IsFailOnly, AtomicOrdering Failure);
/// Create the control flow structure of a canonical OpenMP loop.
///
diff --git a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
index e2ece30b1864..f6eb5066d553 100644
--- a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
+++ b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
@@ -29,7 +29,7 @@
#include "llvm/ADT/GenericConvergenceVerifier.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
#define Check(C, ...) \
do { \
@@ -48,17 +48,6 @@
} while (false)
namespace llvm {
-static bool isConvergenceControlIntrinsic(unsigned IntrinsicID) {
- switch (IntrinsicID) {
- default:
- return false;
- case Intrinsic::experimental_convergence_anchor:
- case Intrinsic::experimental_convergence_entry:
- case Intrinsic::experimental_convergence_loop:
- return true;
- }
-}
-
template <class ContextT> void GenericConvergenceVerifier<ContextT>::clear() {
Tokens.clear();
CI.clear();
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 8940bebd2c9a..b8d578d0fee0 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1724,6 +1724,30 @@ public:
}
};
+/// Check if \p IntrinsicID corresponds to a convergence control intrinsic.
+static inline bool isConvergenceControlIntrinsic(unsigned IntrinsicID) {
+ switch (IntrinsicID) {
+ default:
+ return false;
+ case Intrinsic::experimental_convergence_anchor:
+ case Intrinsic::experimental_convergence_entry:
+ case Intrinsic::experimental_convergence_loop:
+ return true;
+ }
+}
+
+/// Represents calls to the llvm.experimental.convergence.* intrinsics.
+class ConvergenceControlInst : public IntrinsicInst {
+public:
+ static bool classof(const IntrinsicInst *I) {
+ return isConvergenceControlIntrinsic(I->getIntrinsicID());
+ }
+
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
} // end namespace llvm
#endif // LLVM_IR_INTRINSICINST_H
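A minimal sketch of how the new ConvergenceControlInst wrapper composes with LLVM's dyn_cast machinery through the classof() overloads above; findConvergenceToken is an illustrative helper, not part of this patch.

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  // Returns the first convergence control token defined in BB, if any.
  static ConvergenceControlInst *findConvergenceToken(BasicBlock &BB) {
    for (Instruction &I : BB)
      if (auto *Token = dyn_cast<ConvergenceControlInst>(&I))
        return Token;
    return nullptr;
  }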
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 51bd9b63c127..531b11123545 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,8 @@
//
//===----------------------------------------------------------------------===//
+def global_ptr_ty : LLVMQualPointerType<1>;
+
class AMDGPUReadPreloadRegisterIntrinsic
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
@@ -2353,14 +2355,14 @@ def int_amdgcn_s_get_waveid_in_workgroup :
Intrinsic<[llvm_i32_ty], [],
[IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
-class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
+class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
[vt],
- [llvm_anyptr_ty, // vaddr
- vt], // vdata(VGPR)
+ [pt, // vaddr
+ vt], // vdata(VGPR)
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
[SDNPMemOperand]>;
-def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;
+def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>;
// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
// <ray_dir>, <ray_inv_dir>, <texture_descr>
@@ -2486,10 +2488,12 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
-def int_amdgcn_flat_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_global_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
+
+def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
@@ -2692,7 +2696,7 @@ def int_amdgcn_udot8 :
// gfx908 intrinsics
// ===----------------------------------------------------------------------===//
-def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
@@ -2728,11 +2732,11 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
// gfx90a intrinsics
// ===----------------------------------------------------------------------===//
-def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
@@ -2751,8 +2755,8 @@ def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, ll
// ===----------------------------------------------------------------------===//
// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
-def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
[llvm_v2i16_ty],
[LLVMQualPointerType<3>, llvm_v2i16_ty],
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 096d1688af3f..48afdb867ba6 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1656,6 +1656,19 @@ template <typename Op_t> struct PtrToIntSameSize_match {
}
};
+template <typename Op_t> struct NNegZExt_match {
+ Op_t Op;
+
+ NNegZExt_match(const Op_t &OpMatch) : Op(OpMatch) {}
+
+ template <typename OpTy> bool match(OpTy *V) {
+ if (auto *I = dyn_cast<Instruction>(V))
+ return I->getOpcode() == Instruction::ZExt && I->hasNonNeg() &&
+ Op.match(I->getOperand(0));
+ return false;
+ }
+};
+
/// Matches BitCast.
template <typename OpTy>
inline CastOperator_match<OpTy, Instruction::BitCast>
@@ -1708,6 +1721,11 @@ inline CastInst_match<OpTy, Instruction::ZExt> m_ZExt(const OpTy &Op) {
}
template <typename OpTy>
+inline NNegZExt_match<OpTy> m_NNegZExt(const OpTy &Op) {
+ return NNegZExt_match<OpTy>(Op);
+}
+
+template <typename OpTy>
inline match_combine_or<CastInst_match<OpTy, Instruction::ZExt>, OpTy>
m_ZExtOrSelf(const OpTy &Op) {
return m_CombineOr(m_ZExt(Op), Op);
@@ -1719,6 +1737,14 @@ m_SExtOrSelf(const OpTy &Op) {
return m_CombineOr(m_SExt(Op), Op);
}
+/// Match either "sext" or "zext nneg".
+template <typename OpTy>
+inline match_combine_or<CastInst_match<OpTy, Instruction::SExt>,
+ NNegZExt_match<OpTy>>
+m_SExtLike(const OpTy &Op) {
+ return m_CombineOr(m_SExt(Op), m_NNegZExt(Op));
+}
+
template <typename OpTy>
inline match_combine_or<CastInst_match<OpTy, Instruction::ZExt>,
CastInst_match<OpTy, Instruction::SExt>>
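A hedged sketch of how the new m_SExtLike combinator can be used in a fold; matchSignExtendedAdd is an invented example that merely exercises the matcher, not code from this patch.

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Matches "add (sext X), (sext Y)" where either extension may also be
  // spelled "zext nneg", which is equivalent for sign-extension purposes.
  static bool matchSignExtendedAdd(Value *V, Value *&X, Value *&Y) {
    return match(V, m_Add(m_SExtLike(m_Value(X)), m_SExtLike(m_Value(Y))));
  }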
diff --git a/llvm/include/llvm/Object/Wasm.h b/llvm/include/llvm/Object/Wasm.h
index dfab4c68d18f..6b8edb90e144 100644
--- a/llvm/include/llvm/Object/Wasm.h
+++ b/llvm/include/llvm/Object/Wasm.h
@@ -144,7 +144,6 @@ public:
ArrayRef<wasm::WasmGlobal> globals() const { return Globals; }
ArrayRef<wasm::WasmTag> tags() const { return Tags; }
ArrayRef<wasm::WasmExport> exports() const { return Exports; }
- ArrayRef<WasmSymbol> syms() const { return Symbols; }
const wasm::WasmLinkingData &linkingData() const { return LinkingData; }
uint32_t getNumberOfSymbols() const { return Symbols.size(); }
ArrayRef<wasm::WasmElemSegment> elements() const { return ElemSegments; }
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 493689f6a61e..2757b8cd54a6 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -447,7 +447,7 @@ public:
bool isConditionIndependencePairCovered(unsigned Condition) const {
auto It = PosToID.find(Condition);
if (It != PosToID.end())
- return (IndependencePairs.find(It->second) != IndependencePairs.end());
+ return IndependencePairs.contains(It->second);
llvm_unreachable("Condition ID without an Ordinal mapping");
}
diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
index d995cc69af89..66aaf602d0e1 100644
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -883,7 +883,7 @@ public:
/// Returns the call target map collected at a given location.
/// Each location is specified by \p LineOffset and \p Discriminator.
/// If the location is not found in profile, return error.
- ErrorOr<SampleRecord::CallTargetMap>
+ ErrorOr<const SampleRecord::CallTargetMap &>
findCallTargetMapAt(uint32_t LineOffset, uint32_t Discriminator) const {
const auto &ret = BodySamples.find(
mapIRLocToProfileLoc(LineLocation(LineOffset, Discriminator)));
@@ -894,7 +894,7 @@ public:
/// Returns the call target map collected at a given location specified by \p
/// CallSite. If the location is not found in profile, return error.
- ErrorOr<SampleRecord::CallTargetMap>
+ ErrorOr<const SampleRecord::CallTargetMap &>
findCallTargetMapAt(const LineLocation &CallSite) const {
const auto &Ret = BodySamples.find(mapIRLocToProfileLoc(CallSite));
if (Ret == BodySamples.end())
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index 2de2cf4185d8..84cac3ef700e 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -127,12 +127,20 @@ enum : int32_t {
#undef COMPUTE_PGM_RSRC1
// Compute program resource register 2. Must match hardware definition.
+// GFX6+.
#define COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_ ## NAME, SHIFT, WIDTH)
+// [GFX6-GFX11].
+#define COMPUTE_PGM_RSRC2_GFX6_GFX11(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX6_GFX11_##NAME, SHIFT, WIDTH)
+// GFX12+.
+#define COMPUTE_PGM_RSRC2_GFX12_PLUS(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_GFX12_PLUS_##NAME, SHIFT, WIDTH)
enum : int32_t {
COMPUTE_PGM_RSRC2(ENABLE_PRIVATE_SEGMENT, 0, 1),
COMPUTE_PGM_RSRC2(USER_SGPR_COUNT, 1, 5),
- COMPUTE_PGM_RSRC2(ENABLE_TRAP_HANDLER, 6, 1),
+ COMPUTE_PGM_RSRC2_GFX6_GFX11(ENABLE_TRAP_HANDLER, 6, 1),
+ COMPUTE_PGM_RSRC2_GFX12_PLUS(RESERVED1, 6, 1),
COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
@@ -166,23 +174,37 @@ enum : int32_t {
// Compute program resource register 3 for GFX10+. Must match hardware
// definition.
-// [GFX10].
-#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \
- AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_ ## NAME, SHIFT, WIDTH)
// GFX10+.
#define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_ ## NAME, SHIFT, WIDTH)
+// [GFX10].
+#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_##NAME, SHIFT, WIDTH)
+// [GFX10-GFX11].
+#define COMPUTE_PGM_RSRC3_GFX10_GFX11(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_GFX11_##NAME, SHIFT, WIDTH)
// GFX11+.
#define COMPUTE_PGM_RSRC3_GFX11_PLUS(NAME, SHIFT, WIDTH) \
AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_PLUS_ ## NAME, SHIFT, WIDTH)
+// [GFX11].
+#define COMPUTE_PGM_RSRC3_GFX11(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX11_##NAME, SHIFT, WIDTH)
+// GFX12+.
+#define COMPUTE_PGM_RSRC3_GFX12_PLUS(NAME, SHIFT, WIDTH) \
+ AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX12_PLUS_##NAME, SHIFT, WIDTH)
enum : int32_t {
- COMPUTE_PGM_RSRC3_GFX10_PLUS(SHARED_VGPR_COUNT, 0, 4),
- COMPUTE_PGM_RSRC3_GFX10(RESERVED0, 4, 8),
- COMPUTE_PGM_RSRC3_GFX11_PLUS(INST_PREF_SIZE, 4, 6),
- COMPUTE_PGM_RSRC3_GFX11_PLUS(TRAP_ON_START, 10, 1),
- COMPUTE_PGM_RSRC3_GFX11_PLUS(TRAP_ON_END, 11, 1),
- COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED1, 12, 19),
- COMPUTE_PGM_RSRC3_GFX10(RESERVED2, 31, 1),
+ COMPUTE_PGM_RSRC3_GFX10_GFX11(SHARED_VGPR_COUNT, 0, 4),
+ COMPUTE_PGM_RSRC3_GFX12_PLUS(RESERVED0, 0, 4),
+ COMPUTE_PGM_RSRC3_GFX10(RESERVED1, 4, 8),
+ COMPUTE_PGM_RSRC3_GFX11(INST_PREF_SIZE, 4, 6),
+ COMPUTE_PGM_RSRC3_GFX11(TRAP_ON_START, 10, 1),
+ COMPUTE_PGM_RSRC3_GFX11(TRAP_ON_END, 11, 1),
+ COMPUTE_PGM_RSRC3_GFX12_PLUS(INST_PREF_SIZE, 4, 8),
+ COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED2, 12, 1),
+ COMPUTE_PGM_RSRC3_GFX10_GFX11(RESERVED3, 13, 1),
+ COMPUTE_PGM_RSRC3_GFX12_PLUS(GLG_EN, 13, 1),
+ COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED4, 14, 17),
+ COMPUTE_PGM_RSRC3_GFX10(RESERVED5, 31, 1),
COMPUTE_PGM_RSRC3_GFX11_PLUS(IMAGE_OP, 31, 1),
};
#undef COMPUTE_PGM_RSRC3_GFX10_PLUS
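A short sketch of what the split means for consumers, assuming the header's AMDHSA_BITS_GET helper keeps its usual (SRC & MSK) >> MSK##_SHIFT expansion; getTrapHandlerEnable is illustrative only.

  #include "llvm/Support/AMDHSAKernelDescriptor.h"
  #include <cstdint>
  using namespace llvm::amdhsa;

  // The trap-handler bit is now generation-specific: valid on GFX6-GFX11 and
  // reserved on GFX12+, so callers must pick the matching field name.
  static uint32_t getTrapHandlerEnable(uint32_t Rsrc2, bool IsGFX12Plus) {
    if (IsGFX12Plus)
      return 0; // bit 6 is RESERVED1 on GFX12+
    return AMDHSA_BITS_GET(Rsrc2,
                           COMPUTE_PGM_RSRC2_GFX6_GFX11_ENABLE_TRAP_HANDLER);
  }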
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index b0683ac2e32c..3aceb247a26c 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -70,7 +70,8 @@ enum attributeBits {
ATTR_EVEXKZ = 0x1 << 11,
ATTR_EVEXB = 0x1 << 12,
ATTR_REX2 = 0x1 << 13,
- ATTR_max = 0x1 << 14,
+ ATTR_EVEXNF = 0x1 << 14,
+ ATTR_max = 0x1 << 15,
};
// Combinations of the above attributes that are relevant to instruction
@@ -137,12 +138,15 @@ enum attributeBits {
ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \
ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \
ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \
+ ENUM_ENTRY(IC_EVEX_NF, 2, "requires EVEX and NF prefix") \
ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \
ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \
ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_NF, 3, "requires EVEX, NF and the OpSize prefix") \
ENUM_ENTRY(IC_EVEX_OPSIZE_ADSIZE, 3, \
"requires EVEX, OPSIZE and the ADSIZE prefix") \
ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_NF, 4, "requires EVEX, W and NF prefix") \
ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \
ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \
ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \
@@ -187,10 +191,13 @@ enum attributeBits {
ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \
ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \
ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \
+ ENUM_ENTRY(IC_EVEX_B_NF, 2, "requires EVEX_NF and EVEX_B prefix") \
ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \
ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \
ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_B_NF, 3, "requires EVEX_B, NF and OpSize prefix") \
ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_B_NF, 4, "requires EVEX_NF, EVEX_B and the W prefix") \
ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \
ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \
ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 77db371adaf7..6bda80681432 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -437,13 +437,6 @@ def select_constant_cmp: GICombineRule<
(apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }])
>;
-def select_to_logical : GICombineRule<
- (defs root:$root, build_fn_matchinfo:$matchinfo),
- (match (wip_match_opcode G_SELECT):$root,
- [{ return Helper.matchSelectToLogical(*${root}, ${matchinfo}); }]),
- (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
->;
-
// Fold (C op x) -> (x op C)
// TODO: handle more isCommutable opcodes
// TODO: handle compares (currently not marked as isCommutable)
@@ -1242,6 +1235,12 @@ def select_to_minmax: GICombineRule<
[{ return Helper.matchSimplifySelectToMinMax(*${root}, ${info}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
+def match_selects : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_SELECT):$root,
+ [{ return Helper.matchSelect(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -1282,7 +1281,7 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend,
def phi_combines : GICombineGroup<[extend_through_phis]>;
def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp,
- select_to_logical]>;
+ match_selects]>;
def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd,
mul_by_neg_one, idempotent_prop]>;
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index f0b35790133f..53dc2be825f2 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -174,6 +174,8 @@ enum ArchExtKind : unsigned {
AEK_SMEF8F32 = 70, // FEAT_SME_F8F32
AEK_SMEFA64 = 71, // FEAT_SME_FA64
AEK_CPA = 72, // FEAT_CPA
+ AEK_PAUTHLR = 73, // FEAT_PAuth_LR
+ AEK_TLBIW = 74, // FEAT_TLBIW
AEK_NUM_EXTENSIONS
};
using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>;
@@ -297,6 +299,8 @@ inline constexpr ExtensionInfo Extensions[] = {
{"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
{"sme-fa64", AArch64::AEK_SMEFA64, "+sme-fa64", "-sme-fa64", FEAT_INIT, "", 0},
{"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0},
+ {"pauth-lr", AArch64::AEK_PAUTHLR, "+pauth-lr", "-pauth-lr", FEAT_INIT, "", 0},
+ {"tlbiw", AArch64::AEK_TLBIW, "+tlbiw", "-tlbiw", FEAT_INIT, "", 0},
// Special cases
{"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority},
};
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
index e3d9ffc1d4db..1e4187c6fb11 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
@@ -41,6 +41,7 @@ struct ParsedBranchProtection {
StringRef Scope;
StringRef Key;
bool BranchTargetEnforcement;
+ bool BranchProtectionPAuthLR;
};
bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
diff --git a/llvm/include/llvm/TextAPI/Utils.h b/llvm/include/llvm/TextAPI/Utils.h
new file mode 100644
index 000000000000..bb22ea5e9606
--- /dev/null
+++ b/llvm/include/llvm/TextAPI/Utils.h
@@ -0,0 +1,34 @@
+//===- llvm/TextAPI/Utils.h - TAPI Utils -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper functionality used for Darwin specific operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TEXTAPI_UTILS_H
+#define LLVM_TEXTAPI_UTILS_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+#if !defined(PATH_MAX)
+#define PATH_MAX 1024
+#endif
+
+namespace llvm::MachO {
+
+using PathSeq = std::vector<std::string>;
+
+/// Replace extension considering frameworks.
+///
+/// \param Path Location of file.
+/// \param Extension File extension to update with.
+void replace_extension(SmallVectorImpl<char> &Path, const Twine &Extension);
+} // namespace llvm::MachO
+#endif // LLVM_TEXTAPI_UTILS_H
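A hedged usage sketch for the new helper; toTBDPath is an invented wrapper, and the framework-aware behaviour is taken from the declaration's comment rather than verified against the implementation.

  #include "llvm/ADT/SmallString.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/TextAPI/Utils.h"
  using namespace llvm;

  // Derive the TBD stub path for an installed library. Unlike
  // sys::path::replace_extension, the TextAPI helper is documented to handle
  // framework layouts (Foo.framework/Foo) as well.
  static SmallString<128> toTBDPath(StringRef DylibPath) {
    SmallString<128> Path(DylibPath);
    MachO::replace_extension(Path, ".tbd");
    return Path;
  }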
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 50167708163e..30c51250af61 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1157,6 +1157,12 @@ struct AnalysisGetter {
return nullptr;
}
+ /// Invalidates the analyses. Valid only when using the new pass manager.
+ void invalidateAnalyses() {
+ assert(FAM && "Can only be used from the new PM!");
+ FAM->clear();
+ }
+
AnalysisGetter(FunctionAnalysisManager &FAM, bool CachedOnly = false)
: FAM(&FAM), CachedOnly(CachedOnly) {}
AnalysisGetter(Pass *P, bool CachedOnly = false)
@@ -1286,6 +1292,10 @@ struct InformationCache {
return AssumeOnlyValues.contains(&I);
}
+ /// Invalidates the cached analyses. Valid only when using the new pass
+ /// manager.
+ void invalidateAnalyses() { AG.invalidateAnalyses(); }
+
/// Return the analysis result from a pass \p AP for function \p F.
template <typename AP>
typename AP::Result *getAnalysisResultForFunction(const Function &F,
diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
index 9a0abdfa8954..8bf902fc8d28 100644
--- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
+++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h
@@ -114,9 +114,8 @@ public:
uint64_t CallsiteCount = 0;
LineLocation Callsite = Callee->getCallSiteLoc();
if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
- SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
- auto It = TargetCounts.find(CalleeSamples->getFunction());
- if (It != TargetCounts.end())
+ auto It = CallTargets->find(CalleeSamples->getFunction());
+ if (It != CallTargets->end())
CallsiteCount = It->second;
}
Weight = std::max(CallsiteCount, CalleeEntryCount);
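Sketch of the call-site pattern after the return type change: findCallTargetMapAt now hands back a reference into BodySamples, so lookups go straight through the ErrorOr without copying the map. countCallTargetSamples and its parameters are illustrative; the FunctionId key type is taken from the ProfiledCallGraph use above.

  #include "llvm/ProfileData/SampleProf.h"
  using namespace llvm;
  using namespace llvm::sampleprof;

  // Counts how many samples at CallSite target Callee; no per-query copy of
  // the underlying CallTargetMap is made anymore.
  static uint64_t countCallTargetSamples(const FunctionSamples &FS,
                                         const LineLocation &CallSite,
                                         FunctionId Callee) {
    if (auto Targets = FS.findCallTargetMapAt(CallSite)) {
      auto It = Targets->find(Callee);
      if (It != Targets->end())
        return It->second;
    }
    return 0;
  }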
diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp
index 8a802515b6f4..35bdd869a88d 100644
--- a/llvm/lib/Analysis/ConstraintSystem.cpp
+++ b/llvm/lib/Analysis/ConstraintSystem.cpp
@@ -29,7 +29,6 @@ bool ConstraintSystem::eliminateUsingFM() {
assert(!Constraints.empty() &&
"should only be called for non-empty constraint systems");
- uint32_t NewGCD = 1;
unsigned LastIdx = NumVariables - 1;
// First, either remove the variable in place if it is 0 or add the row to
@@ -96,24 +95,20 @@ bool ConstraintSystem::eliminateUsingFM() {
IdxUpper++;
}
- if (MulOverflow(UpperV, ((-1) * LowerLast / GCD), M1))
+ if (MulOverflow(UpperV, ((-1) * LowerLast), M1))
return false;
if (IdxLower < LowerRow.size() && LowerRow[IdxLower].Id == CurrentId) {
LowerV = LowerRow[IdxLower].Coefficient;
IdxLower++;
}
- if (MulOverflow(LowerV, (UpperLast / GCD), M2))
+ if (MulOverflow(LowerV, (UpperLast), M2))
return false;
if (AddOverflow(M1, M2, N))
return false;
if (N == 0)
continue;
NR.emplace_back(N, CurrentId);
-
- NewGCD =
- APIntOps::GreatestCommonDivisor({32, (uint32_t)N}, {32, NewGCD})
- .getZExtValue();
}
if (NR.empty())
continue;
@@ -124,7 +119,6 @@ bool ConstraintSystem::eliminateUsingFM() {
}
}
NumVariables -= 1;
- GCD = NewGCD;
return true;
}
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 5beac5547d65..78a833476334 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -1189,14 +1189,26 @@ static Value *simplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
if (Value *V = simplifyDivRem(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
- // If this is an exact divide by a constant, then the dividend (Op0) must have
- // at least as many trailing zeros as the divisor to divide evenly. If it has
- // less trailing zeros, then the result must be poison.
const APInt *DivC;
- if (IsExact && match(Op1, m_APInt(DivC)) && DivC->countr_zero()) {
- KnownBits KnownOp0 = computeKnownBits(Op0, /* Depth */ 0, Q);
- if (KnownOp0.countMaxTrailingZeros() < DivC->countr_zero())
- return PoisonValue::get(Op0->getType());
+ if (IsExact && match(Op1, m_APInt(DivC))) {
+ // If this is an exact divide by a constant, then the dividend (Op0) must
+ // have at least as many trailing zeros as the divisor to divide evenly. If
+ // it has less trailing zeros, then the result must be poison.
+ if (DivC->countr_zero()) {
+ KnownBits KnownOp0 = computeKnownBits(Op0, /* Depth */ 0, Q);
+ if (KnownOp0.countMaxTrailingZeros() < DivC->countr_zero())
+ return PoisonValue::get(Op0->getType());
+ }
+
+ // udiv exact (mul nsw X, C), C --> X
+ // sdiv exact (mul nuw X, C), C --> X
+ // where C is not a power of 2.
+ Value *X;
+ if (!DivC->isPowerOf2() &&
+ (Opcode == Instruction::UDiv
+ ? match(Op0, m_NSWMul(m_Value(X), m_Specific(Op1)))
+ : match(Op0, m_NUWMul(m_Value(X), m_Specific(Op1)))))
+ return X;
}
return nullptr;
@@ -4857,14 +4869,12 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
// select ?, poison, X -> X
// select ?, undef, X -> X
if (isa<PoisonValue>(TrueVal) ||
- (Q.isUndefValue(TrueVal) &&
- isGuaranteedNotToBePoison(FalseVal, Q.AC, Q.CxtI, Q.DT)))
+ (Q.isUndefValue(TrueVal) && impliesPoison(FalseVal, Cond)))
return FalseVal;
// select ?, X, poison -> X
// select ?, X, undef -> X
if (isa<PoisonValue>(FalseVal) ||
- (Q.isUndefValue(FalseVal) &&
- isGuaranteedNotToBePoison(TrueVal, Q.AC, Q.CxtI, Q.DT)))
+ (Q.isUndefValue(FalseVal) && impliesPoison(TrueVal, Cond)))
return TrueVal;
// Deal with partial undef vector constants: select ?, VecC, VecC' --> VecC''
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 89cc7ea15ec1..360fc594ef7c 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -434,6 +434,28 @@ class LazyValueInfoImpl {
void solve();
+ // For the following methods, if UseBlockValue is true, the function may
+ // push additional values to the worklist and return nullopt. If
+ // UseBlockValue is false, it will never return nullopt.
+
+ std::optional<ValueLatticeElement>
+ getValueFromSimpleICmpCondition(CmpInst::Predicate Pred, Value *RHS,
+ const APInt &Offset, Instruction *CxtI,
+ bool UseBlockValue);
+
+ std::optional<ValueLatticeElement>
+ getValueFromICmpCondition(Value *Val, ICmpInst *ICI, bool isTrueDest,
+ bool UseBlockValue);
+
+ std::optional<ValueLatticeElement>
+ getValueFromCondition(Value *Val, Value *Cond, bool IsTrueDest,
+ bool UseBlockValue, unsigned Depth = 0);
+
+ std::optional<ValueLatticeElement> getEdgeValueLocal(Value *Val,
+ BasicBlock *BBFrom,
+ BasicBlock *BBTo,
+ bool UseBlockValue);
+
public:
/// This is the query interface to determine the lattice value for the
/// specified Value* at the context instruction (if specified) or at the
@@ -755,14 +777,10 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) {
return Result;
}
-static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond,
- bool isTrueDest = true,
- unsigned Depth = 0);
-
// If we can determine a constraint on the value given conditions assumed by
// the program, intersect those constraints with BBLV
void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange(
- Value *Val, ValueLatticeElement &BBLV, Instruction *BBI) {
+ Value *Val, ValueLatticeElement &BBLV, Instruction *BBI) {
BBI = BBI ? BBI : dyn_cast<Instruction>(Val);
if (!BBI)
return;
@@ -779,17 +797,21 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange(
if (I->getParent() != BB || !isValidAssumeForContext(I, BBI))
continue;
- BBLV = intersect(BBLV, getValueFromCondition(Val, I->getArgOperand(0)));
+ BBLV = intersect(BBLV, *getValueFromCondition(Val, I->getArgOperand(0),
+ /*IsTrueDest*/ true,
+ /*UseBlockValue*/ false));
}
// If guards are not used in the module, don't spend time looking for them
if (GuardDecl && !GuardDecl->use_empty() &&
BBI->getIterator() != BB->begin()) {
- for (Instruction &I : make_range(std::next(BBI->getIterator().getReverse()),
- BB->rend())) {
+ for (Instruction &I :
+ make_range(std::next(BBI->getIterator().getReverse()), BB->rend())) {
Value *Cond = nullptr;
if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>(m_Value(Cond))))
- BBLV = intersect(BBLV, getValueFromCondition(Val, Cond));
+ BBLV = intersect(BBLV,
+ *getValueFromCondition(Val, Cond, /*IsTrueDest*/ true,
+ /*UseBlockValue*/ false));
}
}
@@ -886,10 +908,14 @@ LazyValueInfoImpl::solveBlockValueSelect(SelectInst *SI, BasicBlock *BB) {
// If the value is undef, a different value may be chosen in
// the select condition.
if (isGuaranteedNotToBeUndef(Cond, AC)) {
- TrueVal = intersect(TrueVal,
- getValueFromCondition(SI->getTrueValue(), Cond, true));
- FalseVal = intersect(
- FalseVal, getValueFromCondition(SI->getFalseValue(), Cond, false));
+ TrueVal =
+ intersect(TrueVal, *getValueFromCondition(SI->getTrueValue(), Cond,
+ /*IsTrueDest*/ true,
+ /*UseBlockValue*/ false));
+ FalseVal =
+ intersect(FalseVal, *getValueFromCondition(SI->getFalseValue(), Cond,
+ /*IsTrueDest*/ false,
+ /*UseBlockValue*/ false));
}
ValueLatticeElement Result = TrueVal;
@@ -950,9 +976,11 @@ LazyValueInfoImpl::solveBlockValueBinaryOpImpl(
// lets us pick up facts from expressions like "and i32 (call i32
// @foo()), 32"
std::optional<ConstantRange> LHSRes = getRangeFor(I->getOperand(0), I, BB);
+ if (!LHSRes)
+ return std::nullopt;
+
std::optional<ConstantRange> RHSRes = getRangeFor(I->getOperand(1), I, BB);
- if (!LHSRes || !RHSRes)
- // More work to do before applying this transfer rule.
+ if (!RHSRes)
return std::nullopt;
const ConstantRange &LHSRange = *LHSRes;
@@ -1068,15 +1096,26 @@ static bool matchICmpOperand(APInt &Offset, Value *LHS, Value *Val,
}
/// Get value range for a "(Val + Offset) Pred RHS" condition.
-static ValueLatticeElement getValueFromSimpleICmpCondition(
- CmpInst::Predicate Pred, Value *RHS, const APInt &Offset) {
+std::optional<ValueLatticeElement>
+LazyValueInfoImpl::getValueFromSimpleICmpCondition(CmpInst::Predicate Pred,
+ Value *RHS,
+ const APInt &Offset,
+ Instruction *CxtI,
+ bool UseBlockValue) {
ConstantRange RHSRange(RHS->getType()->getIntegerBitWidth(),
/*isFullSet=*/true);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS))
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
RHSRange = ConstantRange(CI->getValue());
- else if (Instruction *I = dyn_cast<Instruction>(RHS))
+ } else if (UseBlockValue) {
+ std::optional<ValueLatticeElement> R =
+ getBlockValue(RHS, CxtI->getParent(), CxtI);
+ if (!R)
+ return std::nullopt;
+ RHSRange = toConstantRange(*R, RHS->getType());
+ } else if (Instruction *I = dyn_cast<Instruction>(RHS)) {
if (auto *Ranges = I->getMetadata(LLVMContext::MD_range))
RHSRange = getConstantRangeFromMetadata(*Ranges);
+ }
ConstantRange TrueValues =
ConstantRange::makeAllowedICmpRegion(Pred, RHSRange);
@@ -1103,8 +1142,8 @@ getRangeViaSLT(CmpInst::Predicate Pred, APInt RHS,
return std::nullopt;
}
-static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI,
- bool isTrueDest) {
+std::optional<ValueLatticeElement> LazyValueInfoImpl::getValueFromICmpCondition(
+ Value *Val, ICmpInst *ICI, bool isTrueDest, bool UseBlockValue) {
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
@@ -1128,11 +1167,13 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI,
unsigned BitWidth = Ty->getScalarSizeInBits();
APInt Offset(BitWidth, 0);
if (matchICmpOperand(Offset, LHS, Val, EdgePred))
- return getValueFromSimpleICmpCondition(EdgePred, RHS, Offset);
+ return getValueFromSimpleICmpCondition(EdgePred, RHS, Offset, ICI,
+ UseBlockValue);
CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(EdgePred);
if (matchICmpOperand(Offset, RHS, Val, SwappedPred))
- return getValueFromSimpleICmpCondition(SwappedPred, LHS, Offset);
+ return getValueFromSimpleICmpCondition(SwappedPred, LHS, Offset, ICI,
+ UseBlockValue);
const APInt *Mask, *C;
if (match(LHS, m_And(m_Specific(Val), m_APInt(Mask))) &&
@@ -1212,10 +1253,12 @@ static ValueLatticeElement getValueFromOverflowCondition(
return ValueLatticeElement::getRange(NWR);
}
-static ValueLatticeElement getValueFromCondition(
- Value *Val, Value *Cond, bool IsTrueDest, unsigned Depth) {
+std::optional<ValueLatticeElement>
+LazyValueInfoImpl::getValueFromCondition(Value *Val, Value *Cond,
+ bool IsTrueDest, bool UseBlockValue,
+ unsigned Depth) {
if (ICmpInst *ICI = dyn_cast<ICmpInst>(Cond))
- return getValueFromICmpCondition(Val, ICI, IsTrueDest);
+ return getValueFromICmpCondition(Val, ICI, IsTrueDest, UseBlockValue);
if (auto *EVI = dyn_cast<ExtractValueInst>(Cond))
if (auto *WO = dyn_cast<WithOverflowInst>(EVI->getAggregateOperand()))
@@ -1227,7 +1270,7 @@ static ValueLatticeElement getValueFromCondition(
Value *N;
if (match(Cond, m_Not(m_Value(N))))
- return getValueFromCondition(Val, N, !IsTrueDest, Depth);
+ return getValueFromCondition(Val, N, !IsTrueDest, UseBlockValue, Depth);
Value *L, *R;
bool IsAnd;
@@ -1238,19 +1281,25 @@ static ValueLatticeElement getValueFromCondition(
else
return ValueLatticeElement::getOverdefined();
- ValueLatticeElement LV = getValueFromCondition(Val, L, IsTrueDest, Depth);
- ValueLatticeElement RV = getValueFromCondition(Val, R, IsTrueDest, Depth);
+ std::optional<ValueLatticeElement> LV =
+ getValueFromCondition(Val, L, IsTrueDest, UseBlockValue, Depth);
+ if (!LV)
+ return std::nullopt;
+ std::optional<ValueLatticeElement> RV =
+ getValueFromCondition(Val, R, IsTrueDest, UseBlockValue, Depth);
+ if (!RV)
+ return std::nullopt;
// if (L && R) -> intersect L and R
// if (!(L || R)) -> intersect !L and !R
// if (L || R) -> union L and R
// if (!(L && R)) -> union !L and !R
if (IsTrueDest ^ IsAnd) {
- LV.mergeIn(RV);
- return LV;
+ LV->mergeIn(*RV);
+ return *LV;
}
- return intersect(LV, RV);
+ return intersect(*LV, *RV);
}
// Return true if Usr has Op as an operand, otherwise false.
@@ -1302,8 +1351,9 @@ static ValueLatticeElement constantFoldUser(User *Usr, Value *Op,
}
/// Compute the value of Val on the edge BBFrom -> BBTo.
-static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
- BasicBlock *BBTo) {
+std::optional<ValueLatticeElement>
+LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
+ BasicBlock *BBTo, bool UseBlockValue) {
// TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we
// know that v != 0.
if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) {
@@ -1324,13 +1374,16 @@ static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
// If the condition of the branch is an equality comparison, we may be
// able to infer the value.
- ValueLatticeElement Result = getValueFromCondition(Val, Condition,
- isTrueDest);
- if (!Result.isOverdefined())
+ std::optional<ValueLatticeElement> Result =
+ getValueFromCondition(Val, Condition, isTrueDest, UseBlockValue);
+ if (!Result)
+ return std::nullopt;
+
+ if (!Result->isOverdefined())
return Result;
if (User *Usr = dyn_cast<User>(Val)) {
- assert(Result.isOverdefined() && "Result isn't overdefined");
+ assert(Result->isOverdefined() && "Result isn't overdefined");
// Check with isOperationFoldable() first to avoid linearly iterating
// over the operands unnecessarily which can be expensive for
// instructions with many operands.
@@ -1356,8 +1409,8 @@ static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
// br i1 %Condition, label %then, label %else
for (unsigned i = 0; i < Usr->getNumOperands(); ++i) {
Value *Op = Usr->getOperand(i);
- ValueLatticeElement OpLatticeVal =
- getValueFromCondition(Op, Condition, isTrueDest);
+ ValueLatticeElement OpLatticeVal = *getValueFromCondition(
+ Op, Condition, isTrueDest, /*UseBlockValue*/ false);
if (std::optional<APInt> OpConst =
OpLatticeVal.asConstantInteger()) {
Result = constantFoldUser(Usr, Op, *OpConst, DL);
@@ -1367,7 +1420,7 @@ static ValueLatticeElement getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
}
}
}
- if (!Result.isOverdefined())
+ if (!Result->isOverdefined())
return Result;
}
}
@@ -1432,8 +1485,12 @@ LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
if (Constant *VC = dyn_cast<Constant>(Val))
return ValueLatticeElement::get(VC);
- ValueLatticeElement LocalResult = getEdgeValueLocal(Val, BBFrom, BBTo);
- if (hasSingleValue(LocalResult))
+ std::optional<ValueLatticeElement> LocalResult =
+ getEdgeValueLocal(Val, BBFrom, BBTo, /*UseBlockValue*/ true);
+ if (!LocalResult)
+ return std::nullopt;
+
+ if (hasSingleValue(*LocalResult))
// Can't get any more precise here
return LocalResult;
@@ -1453,7 +1510,7 @@ LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
// but then the result is not cached.
intersectAssumeOrGuardBlockValueConstantRange(Val, InBlock, CxtI);
- return intersect(LocalResult, InBlock);
+ return intersect(*LocalResult, InBlock);
}
ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB,
@@ -1499,10 +1556,12 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB,
std::optional<ValueLatticeElement> Result =
getEdgeValue(V, FromBB, ToBB, CxtI);
- if (!Result) {
+ while (!Result) {
+ // As the worklist only explicitly tracks block values (but not edge values)
+ // we may have to call solve() multiple times, as the edge value calculation
+ // may request additional block values.
solve();
Result = getEdgeValue(V, FromBB, ToBB, CxtI);
- assert(Result && "More work to do after problem solved?");
}
LLVM_DEBUG(dbgs() << " Result = " << *Result << "\n");
@@ -1528,13 +1587,17 @@ ValueLatticeElement LazyValueInfoImpl::getValueAtUse(const Use &U) {
if (!isGuaranteedNotToBeUndef(SI->getCondition(), AC))
break;
if (CurrU->getOperandNo() == 1)
- CondVal = getValueFromCondition(V, SI->getCondition(), true);
+ CondVal =
+ *getValueFromCondition(V, SI->getCondition(), /*IsTrueDest*/ true,
+ /*UseBlockValue*/ false);
else if (CurrU->getOperandNo() == 2)
- CondVal = getValueFromCondition(V, SI->getCondition(), false);
+ CondVal =
+ *getValueFromCondition(V, SI->getCondition(), /*IsTrueDest*/ false,
+ /*UseBlockValue*/ false);
} else if (auto *PHI = dyn_cast<PHINode>(CurrI)) {
// TODO: Use non-local query?
- CondVal =
- getEdgeValueLocal(V, PHI->getIncomingBlock(*CurrU), PHI->getParent());
+ CondVal = *getEdgeValueLocal(V, PHI->getIncomingBlock(*CurrU),
+ PHI->getParent(), /*UseBlockValue*/ false);
}
if (CondVal)
VL = intersect(VL, *CondVal);
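The std::optional returns above all follow one contract: nullopt means the query pushed more block values onto the worklist and must be retried after solve(). A generic, self-contained sketch of that driver shape (not LVI's actual API):

  #include <optional>

  // Query() returns std::optional<T>: a value when enough block values are
  // known, std::nullopt after queueing the missing ones. Solve() drains the
  // worklist; the loop mirrors getValueOnEdge() above.
  template <typename QueryFn, typename SolveFn>
  auto resolveWithWorklist(QueryFn Query, SolveFn Solve) {
    auto Result = Query();
    while (!Result) {
      Solve();
      Result = Query();
    }
    return *Result;
  }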
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 49eccde45f31..951e00e34142 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -1292,16 +1292,16 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
if (InsertRes.first->second != Pointer.getAddr()) {
// Make sure to clean up the Visited map before continuing on to
// PredTranslationFailure.
- for (unsigned i = 0; i < NewBlocks.size(); i++)
- Visited.erase(NewBlocks[i]);
+ for (auto *NewBlock : NewBlocks)
+ Visited.erase(NewBlock);
goto PredTranslationFailure;
}
}
if (NewBlocks.size() > WorklistEntries) {
// Make sure to clean up the Visited map before continuing on to
// PredTranslationFailure.
- for (unsigned i = 0; i < NewBlocks.size(); i++)
- Visited.erase(NewBlocks[i]);
+ for (auto *NewBlock : NewBlocks)
+ Visited.erase(NewBlock);
GotWorklistLimit = true;
goto PredTranslationFailure;
}
@@ -1359,8 +1359,8 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// Make sure to clean up the Visited map before continuing on to
// PredTranslationFailure.
- for (unsigned i = 0, n = PredList.size(); i < n; ++i)
- Visited.erase(PredList[i].first);
+ for (const auto &Pred : PredList)
+ Visited.erase(Pred.first);
goto PredTranslationFailure;
}
@@ -1371,9 +1371,9 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// any results for. (getNonLocalPointerDepFromBB will modify our
// datastructures in ways the code after the PredTranslationFailure label
// doesn't expect.)
- for (unsigned i = 0, n = PredList.size(); i < n; ++i) {
- BasicBlock *Pred = PredList[i].first;
- PHITransAddr &PredPointer = PredList[i].second;
+ for (auto &I : PredList) {
+ BasicBlock *Pred = I.first;
+ PHITransAddr &PredPointer = I.second;
Value *PredPtrVal = PredPointer.getAddr();
bool CanTranslate = true;
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 580fe112fcd7..623814c038a7 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -7914,9 +7914,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// expression. We already checked that ShlAmt < BitWidth, so
// the multiplier, 1 << (ShlAmt - AShrAmt), fits into TruncTy as
// ShlAmt - AShrAmt < Amt.
- uint64_t ShlAmt = ShlAmtCI->getZExtValue();
- if (ShlAmtCI->getValue().ult(BitWidth) && ShlAmt >= AShrAmt) {
- APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt, ShlAmt - AShrAmt);
+ const APInt &ShlAmt = ShlAmtCI->getValue();
+ if (ShlAmt.ult(BitWidth) && ShlAmt.uge(AShrAmt)) {
+ APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt,
+ ShlAmtCI->getZExtValue() - AShrAmt);
const SCEV *CompositeExpr =
getMulExpr(AddTruncateExpr, getConstant(Mul));
if (L->getOpcode() != Instruction::Shl)
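The reordering above matters because APInt::getZExtValue() asserts that the value fits in 64 bits; a small standalone illustration of the safe pattern (shiftAmountIsSafe is illustrative only):

  #include "llvm/ADT/APInt.h"
  #include <cstdint>
  using namespace llvm;

  // Range-check as an APInt first; only then is it safe to pull out the raw
  // 64-bit value for arithmetic like ShlAmt - AShrAmt.
  static bool shiftAmountIsSafe(const APInt &ShAmt, unsigned BitWidth) {
    if (!ShAmt.ult(BitWidth))
      return false;                      // may not fit in 64 bits
    uint64_t Raw = ShAmt.getZExtValue(); // now guaranteed not to assert
    (void)Raw;
    return true;
  }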
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 20959cf6948f..bbb7c86d2185 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1149,6 +1149,16 @@ bool TargetLibraryInfoImpl::getLibFunc(const Function &FDecl,
return isValidProtoForLibFunc(*FDecl.getFunctionType(), F, *M);
}
+bool TargetLibraryInfoImpl::getLibFunc(unsigned int Opcode, Type *Ty,
+ LibFunc &F) const {
+ // Must be a frem instruction with float or double arguments.
+ if (Opcode != Instruction::FRem || (!Ty->isDoubleTy() && !Ty->isFloatTy()))
+ return false;
+
+ F = Ty->isDoubleTy() ? LibFunc_fmod : LibFunc_fmodf;
+ return true;
+}
+
void TargetLibraryInfoImpl::disableAllFunctions() {
memset(AvailableArray, 0, sizeof(AvailableArray));
}
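An illustrative use of the new opcode-based overload (only the Impl side is visible in this hunk, so the sketch is written against TargetLibraryInfoImpl); fremLibFunc is an invented name, and availability of the returned LibFunc still has to be checked separately.

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Map a float/double frem instruction to fmodf/fmod; returns false for any
  // other opcode or type, matching the overload above.
  static bool fremLibFunc(const TargetLibraryInfoImpl &TLII,
                          const Instruction &I, LibFunc &F) {
    return TLII.getLibFunc(I.getOpcode(), I.getType(), F);
  }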
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3f76dfdaac31..67246afa2314 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -862,6 +862,15 @@ InstructionCost TargetTransformInfo::getArithmeticInstrCost(
return Cost;
}
+InstructionCost TargetTransformInfo::getAltInstrCost(
+ VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const {
+ InstructionCost Cost =
+ TTIImpl->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
InstructionCost TargetTransformInfo::getShuffleCost(
ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
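A hedged sketch of how a caller such as the SLP vectorizer might query the new hook; the add/sub pairing and the per-lane mask convention are assumptions for illustration, and the cost kind is passed explicitly to avoid relying on a default argument.

  #include "llvm/ADT/SmallBitVector.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Cost of a vector operation that alternates between two opcodes lane by
  // lane, e.g. an add/sub pattern; OpcodeMask encodes the per-lane choice.
  static InstructionCost altOpCost(const TargetTransformInfo &TTI,
                                   VectorType *VecTy,
                                   const SmallBitVector &OpcodeMask) {
    return TTI.getAltInstrCost(VecTy, Instruction::Add, Instruction::Sub,
                               OpcodeMask,
                               TargetTransformInfo::TCK_RecipThroughput);
  }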
diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp
index 22fc52070015..426f98c0c628 100644
--- a/llvm/lib/Analysis/VFABIDemangling.cpp
+++ b/llvm/lib/Analysis/VFABIDemangling.cpp
@@ -126,7 +126,7 @@ static ParseRet tryParseLinearTokenWithRuntimeStep(StringRef &ParseString,
return ParseRet::None;
}
-/// The function looks for the following stringt at the beginning of
+/// The function looks for the following string at the beginning of
/// the input string `ParseString`:
///
/// <token> <number>
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 9b8376833d2e..16d78c1ded6d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -983,45 +983,11 @@ static void computeKnownBitsFromOperator(const Operator *I,
break;
}
case Instruction::Select: {
- const Value *LHS = nullptr, *RHS = nullptr;
- SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
- if (SelectPatternResult::isMinOrMax(SPF)) {
- computeKnownBits(RHS, Known, Depth + 1, Q);
- computeKnownBits(LHS, Known2, Depth + 1, Q);
- switch (SPF) {
- default:
- llvm_unreachable("Unhandled select pattern flavor!");
- case SPF_SMAX:
- Known = KnownBits::smax(Known, Known2);
- break;
- case SPF_SMIN:
- Known = KnownBits::smin(Known, Known2);
- break;
- case SPF_UMAX:
- Known = KnownBits::umax(Known, Known2);
- break;
- case SPF_UMIN:
- Known = KnownBits::umin(Known, Known2);
- break;
- }
- break;
- }
-
computeKnownBits(I->getOperand(2), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
// Only known if known in both the LHS and RHS.
Known = Known.intersectWith(Known2);
-
- if (SPF == SPF_ABS) {
- // RHS from matchSelectPattern returns the negation part of abs pattern.
- // If the negate has an NSW flag we can assume the sign bit of the result
- // will be 0 because that makes abs(INT_MIN) undefined.
- if (match(RHS, m_Neg(m_Specific(LHS))) &&
- Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(RHS)))
- Known.Zero.setSignBit();
- }
-
break;
}
case Instruction::FPTrunc:
@@ -1826,6 +1792,8 @@ void computeKnownBits(const Value *V, const APInt &DemandedElts,
Known.Zero &= ~Elt;
Known.One &= Elt;
}
+ if (Known.hasConflict())
+ Known.resetAll();
return;
}
@@ -1849,6 +1817,8 @@ void computeKnownBits(const Value *V, const APInt &DemandedElts,
Known.Zero &= ~Elt;
Known.One &= Elt;
}
+ if (Known.hasConflict())
+ Known.resetAll();
return;
}
@@ -6285,10 +6255,10 @@ static OverflowResult mapOverflowResult(ConstantRange::OverflowResult OR) {
}
/// Combine constant ranges from computeConstantRange() and computeKnownBits().
-static ConstantRange
-computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V,
- bool ForSigned,
- const SimplifyQuery &SQ) {
+ConstantRange
+llvm::computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V,
+ bool ForSigned,
+ const SimplifyQuery &SQ) {
ConstantRange CR1 =
ConstantRange::fromKnownBits(V.getKnownBits(SQ), ForSigned);
ConstantRange CR2 = computeConstantRange(V, ForSigned, SQ.IIQ.UseInstrInfo);
@@ -6556,10 +6526,25 @@ static bool shiftAmountKnownInRange(const Value *ShiftAmount) {
return Safe;
}
-static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly,
+enum class UndefPoisonKind {
+ PoisonOnly = (1 << 0),
+ UndefOnly = (1 << 1),
+ UndefOrPoison = PoisonOnly | UndefOnly,
+};
+
+static bool includesPoison(UndefPoisonKind Kind) {
+ return (unsigned(Kind) & unsigned(UndefPoisonKind::PoisonOnly)) != 0;
+}
+
+static bool includesUndef(UndefPoisonKind Kind) {
+ return (unsigned(Kind) & unsigned(UndefPoisonKind::UndefOnly)) != 0;
+}
+
+static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind,
bool ConsiderFlagsAndMetadata) {
- if (ConsiderFlagsAndMetadata && Op->hasPoisonGeneratingFlagsOrMetadata())
+ if (ConsiderFlagsAndMetadata && includesPoison(Kind) &&
+ Op->hasPoisonGeneratingFlagsOrMetadata())
return true;
unsigned Opcode = Op->getOpcode();
@@ -6569,7 +6554,7 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly,
case Instruction::Shl:
case Instruction::AShr:
case Instruction::LShr:
- return !shiftAmountKnownInRange(Op->getOperand(1));
+ return includesPoison(Kind) && !shiftAmountKnownInRange(Op->getOperand(1));
case Instruction::FPToSI:
case Instruction::FPToUI:
// fptosi/ui yields poison if the resulting value does not fit in the
@@ -6610,7 +6595,8 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly,
return false;
case Intrinsic::sshl_sat:
case Intrinsic::ushl_sat:
- return !shiftAmountKnownInRange(II->getArgOperand(1));
+ return includesPoison(Kind) &&
+ !shiftAmountKnownInRange(II->getArgOperand(1));
case Intrinsic::fma:
case Intrinsic::fmuladd:
case Intrinsic::sqrt:
@@ -6665,18 +6651,16 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly,
auto *VTy = cast<VectorType>(Op->getOperand(0)->getType());
unsigned IdxOp = Op->getOpcode() == Instruction::InsertElement ? 2 : 1;
auto *Idx = dyn_cast<ConstantInt>(Op->getOperand(IdxOp));
- if (!Idx || Idx->getValue().uge(VTy->getElementCount().getKnownMinValue()))
- return true;
+ if (includesPoison(Kind))
+ return !Idx ||
+ Idx->getValue().uge(VTy->getElementCount().getKnownMinValue());
return false;
}
case Instruction::ShuffleVector: {
- // shufflevector may return undef.
- if (PoisonOnly)
- return false;
ArrayRef<int> Mask = isa<ConstantExpr>(Op)
? cast<ConstantExpr>(Op)->getShuffleMask()
: cast<ShuffleVectorInst>(Op)->getShuffleMask();
- return is_contained(Mask, PoisonMaskElem);
+ return includesPoison(Kind) && is_contained(Mask, PoisonMaskElem);
}
case Instruction::FNeg:
case Instruction::PHI:
@@ -6712,17 +6696,17 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly,
bool llvm::canCreateUndefOrPoison(const Operator *Op,
bool ConsiderFlagsAndMetadata) {
- return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/false,
+ return ::canCreateUndefOrPoison(Op, UndefPoisonKind::UndefOrPoison,
ConsiderFlagsAndMetadata);
}
bool llvm::canCreatePoison(const Operator *Op, bool ConsiderFlagsAndMetadata) {
- return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/true,
+ return ::canCreateUndefOrPoison(Op, UndefPoisonKind::PoisonOnly,
ConsiderFlagsAndMetadata);
}
-static bool directlyImpliesPoison(const Value *ValAssumedPoison,
- const Value *V, unsigned Depth) {
+static bool directlyImpliesPoison(const Value *ValAssumedPoison, const Value *V,
+ unsigned Depth) {
if (ValAssumedPoison == V)
return true;
@@ -6774,14 +6758,11 @@ bool llvm::impliesPoison(const Value *ValAssumedPoison, const Value *V) {
return ::impliesPoison(ValAssumedPoison, V, /* Depth */ 0);
}
-static bool programUndefinedIfUndefOrPoison(const Value *V,
- bool PoisonOnly);
+static bool programUndefinedIfUndefOrPoison(const Value *V, bool PoisonOnly);
-static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
- AssumptionCache *AC,
- const Instruction *CtxI,
- const DominatorTree *DT,
- unsigned Depth, bool PoisonOnly) {
+static bool isGuaranteedNotToBeUndefOrPoison(
+ const Value *V, AssumptionCache *AC, const Instruction *CtxI,
+ const DominatorTree *DT, unsigned Depth, UndefPoisonKind Kind) {
if (Depth >= MaxAnalysisRecursionDepth)
return false;
@@ -6796,16 +6777,19 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
}
if (auto *C = dyn_cast<Constant>(V)) {
+ if (isa<PoisonValue>(C))
+ return !includesPoison(Kind);
+
if (isa<UndefValue>(C))
- return PoisonOnly && !isa<PoisonValue>(C);
+ return !includesUndef(Kind);
if (isa<ConstantInt>(C) || isa<GlobalVariable>(C) || isa<ConstantFP>(V) ||
isa<ConstantPointerNull>(C) || isa<Function>(C))
return true;
if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C))
- return (PoisonOnly ? !C->containsPoisonElement()
- : !C->containsUndefOrPoisonElement()) &&
+ return (!includesUndef(Kind) ? !C->containsPoisonElement()
+ : !C->containsUndefOrPoisonElement()) &&
!C->containsConstantExpression();
}
@@ -6823,8 +6807,7 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
return true;
auto OpCheck = [&](const Value *V) {
- return isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth + 1,
- PoisonOnly);
+ return isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth + 1, Kind);
};
if (auto *Opr = dyn_cast<Operator>(V)) {
@@ -6846,14 +6829,16 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
for (unsigned i = 0; i < Num; ++i) {
auto *TI = PN->getIncomingBlock(i)->getTerminator();
if (!isGuaranteedNotToBeUndefOrPoison(PN->getIncomingValue(i), AC, TI,
- DT, Depth + 1, PoisonOnly)) {
+ DT, Depth + 1, Kind)) {
IsWellDefined = false;
break;
}
}
if (IsWellDefined)
return true;
- } else if (!canCreateUndefOrPoison(Opr) && all_of(Opr->operands(), OpCheck))
+ } else if (!::canCreateUndefOrPoison(Opr, Kind,
+ /*ConsiderFlagsAndMetadata*/ true) &&
+ all_of(Opr->operands(), OpCheck))
return true;
}
@@ -6863,7 +6848,7 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
I->hasMetadata(LLVMContext::MD_dereferenceable_or_null))
return true;
- if (programUndefinedIfUndefOrPoison(V, PoisonOnly))
+ if (programUndefinedIfUndefOrPoison(V, !includesUndef(Kind)))
return true;
// CxtI may be null or a cloned instruction.
@@ -6895,7 +6880,7 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V,
if (Cond) {
if (Cond == V)
return true;
- else if (PoisonOnly && isa<Operator>(Cond)) {
+ else if (!includesUndef(Kind) && isa<Operator>(Cond)) {
// For poison, we can analyze further
auto *Opr = cast<Operator>(Cond);
if (any_of(Opr->operands(),
@@ -6917,20 +6902,22 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC,
const Instruction *CtxI,
const DominatorTree *DT,
unsigned Depth) {
- return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, false);
+ return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth,
+ UndefPoisonKind::UndefOrPoison);
}
bool llvm::isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC,
const Instruction *CtxI,
const DominatorTree *DT, unsigned Depth) {
- return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, true);
+ return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth,
+ UndefPoisonKind::PoisonOnly);
}
bool llvm::isGuaranteedNotToBeUndef(const Value *V, AssumptionCache *AC,
const Instruction *CtxI,
const DominatorTree *DT, unsigned Depth) {
- // TODO: This is currently equivalent to isGuaranteedNotToBeUndefOrPoison().
- return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, false);
+ return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth,
+ UndefPoisonKind::UndefOnly);
}
/// Return true if undefined behavior would provably be executed on the path to
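A small, self-contained illustration of what the split into UndefPoisonKind buys: a plain undef is now reported as guaranteed-not-poison but not guaranteed-not-undef, a distinction the previous code (where isGuaranteedNotToBeUndef reused the combined query) could not make. classifyUndef is illustrative only.

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  static void classifyUndef(LLVMContext &Ctx) {
    Value *U = UndefValue::get(Type::getInt32Ty(Ctx));
    bool NotPoison = isGuaranteedNotToBePoison(U); // true: undef is not poison
    bool NotUndef = isGuaranteedNotToBeUndef(U);   // false: it is still undef
    (void)NotPoison;
    (void)NotUndef;
  }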
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index f90fca9d937f..5b57f0a25cec 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -123,6 +123,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
int OpdIdx) {
+ assert(ID != Intrinsic::not_intrinsic && "Not an intrinsic!");
+
switch (ID) {
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 8907f6fa4ff3..a027d0c21ba0 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4218,6 +4218,9 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
// Check whether we have enough values to read a partition name.
if (OpNum + 1 < Record.size()) {
+ // Check Strtab has enough values for the partition.
+ if (Record[OpNum] + Record[OpNum + 1] > Strtab.size())
+ return error("Malformed partition, too large.");
NewGA->setPartition(
StringRef(Strtab.data() + Record[OpNum], Record[OpNum + 1]));
OpNum += 2;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index d6f487c18b03..30ea7eef3a12 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -412,7 +412,7 @@ static uint32_t constructAbbreviationTag(
const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> &EntryRet) {
uint32_t AbbrvTag = 0;
if (EntryRet)
- AbbrvTag |= 1 << EntryRet->Endoding.Index;
+ AbbrvTag |= 1 << EntryRet->Encoding.Index;
AbbrvTag |= 1 << dwarf::DW_IDX_die_offset;
AbbrvTag |= Tag << LowerBitSize;
return AbbrvTag;
@@ -429,7 +429,7 @@ void Dwarf5AccelTableWriter<DataT>::populateAbbrevsMap() {
if (Abbreviations.count(AbbrvTag) == 0) {
SmallVector<DWARF5AccelTableData::AttributeEncoding, 2> UA;
if (EntryRet)
- UA.push_back(EntryRet->Endoding);
+ UA.push_back(EntryRet->Encoding);
UA.push_back({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4});
Abbreviations.try_emplace(AbbrvTag, UA);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 91a64d59e154..8b15bdb0aca3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5940,62 +5940,6 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
return false;
}
-bool CombinerHelper::matchSelectToLogical(MachineInstr &MI,
- BuildFnTy &MatchInfo) {
- GSelect &Sel = cast<GSelect>(MI);
- Register DstReg = Sel.getReg(0);
- Register Cond = Sel.getCondReg();
- Register TrueReg = Sel.getTrueReg();
- Register FalseReg = Sel.getFalseReg();
-
- auto *TrueDef = getDefIgnoringCopies(TrueReg, MRI);
- auto *FalseDef = getDefIgnoringCopies(FalseReg, MRI);
-
- const LLT CondTy = MRI.getType(Cond);
- const LLT OpTy = MRI.getType(TrueReg);
- if (CondTy != OpTy || OpTy.getScalarSizeInBits() != 1)
- return false;
-
- // We have a boolean select.
-
- // select Cond, Cond, F --> or Cond, F
- // select Cond, 1, F --> or Cond, F
- auto MaybeCstTrue = isConstantOrConstantSplatVector(*TrueDef, MRI);
- if (Cond == TrueReg || (MaybeCstTrue && MaybeCstTrue->isOne())) {
- MatchInfo = [=](MachineIRBuilder &MIB) {
- MIB.buildOr(DstReg, Cond, FalseReg);
- };
- return true;
- }
-
- // select Cond, T, Cond --> and Cond, T
- // select Cond, T, 0 --> and Cond, T
- auto MaybeCstFalse = isConstantOrConstantSplatVector(*FalseDef, MRI);
- if (Cond == FalseReg || (MaybeCstFalse && MaybeCstFalse->isZero())) {
- MatchInfo = [=](MachineIRBuilder &MIB) {
- MIB.buildAnd(DstReg, Cond, TrueReg);
- };
- return true;
- }
-
- // select Cond, T, 1 --> or (not Cond), T
- if (MaybeCstFalse && MaybeCstFalse->isOne()) {
- MatchInfo = [=](MachineIRBuilder &MIB) {
- MIB.buildOr(DstReg, MIB.buildNot(OpTy, Cond), TrueReg);
- };
- return true;
- }
-
- // select Cond, 0, F --> and (not Cond), F
- if (MaybeCstTrue && MaybeCstTrue->isZero()) {
- MatchInfo = [=](MachineIRBuilder &MIB) {
- MIB.buildAnd(DstReg, MIB.buildNot(OpTy, Cond), FalseReg);
- };
- return true;
- }
- return false;
-}
-
bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI,
unsigned &IdxToPropagate) {
bool PropagateNaN;
@@ -6318,3 +6262,300 @@ void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) {
MI.getOperand(2).setReg(LHSReg);
Observer.changedInstr(MI);
}
+
+bool CombinerHelper::isOneOrOneSplat(Register Src, bool AllowUndefs) {
+ LLT SrcTy = MRI.getType(Src);
+ if (SrcTy.isFixedVector())
+ return isConstantSplatVector(Src, 1, AllowUndefs);
+ if (SrcTy.isScalar()) {
+ if (AllowUndefs && getOpcodeDef<GImplicitDef>(Src, MRI) != nullptr)
+ return true;
+ auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+ return IConstant && IConstant->Value == 1;
+ }
+ return false; // scalable vector
+}
+
+bool CombinerHelper::isZeroOrZeroSplat(Register Src, bool AllowUndefs) {
+ LLT SrcTy = MRI.getType(Src);
+ if (SrcTy.isFixedVector())
+ return isConstantSplatVector(Src, 0, AllowUndefs);
+ if (SrcTy.isScalar()) {
+ if (AllowUndefs && getOpcodeDef<GImplicitDef>(Src, MRI) != nullptr)
+ return true;
+ auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+ return IConstant && IConstant->Value == 0;
+ }
+ return false; // scalable vector
+}
+
+// Ignores COPYs during conformance checks.
+// FIXME: Handle scalable vectors.
+bool CombinerHelper::isConstantSplatVector(Register Src, int64_t SplatValue,
+ bool AllowUndefs) {
+ GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+ if (!BuildVector)
+ return false;
+ unsigned NumSources = BuildVector->getNumSources();
+
+ for (unsigned I = 0; I < NumSources; ++I) {
+ GImplicitDef *ImplicitDef =
+ getOpcodeDef<GImplicitDef>(BuildVector->getSourceReg(I), MRI);
+ if (ImplicitDef && AllowUndefs)
+ continue;
+ if (ImplicitDef && !AllowUndefs)
+ return false;
+ std::optional<ValueAndVReg> IConstant =
+ getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+ if (IConstant && IConstant->Value == SplatValue)
+ continue;
+ return false;
+ }
+ return true;
+}
+
+// Ignores COPYs during lookups.
+// FIXME: Handle scalable vectors.
+std::optional<APInt>
+CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
+ auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+ if (IConstant)
+ return IConstant->Value;
+
+ GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+ if (!BuildVector)
+ return std::nullopt;
+ unsigned NumSources = BuildVector->getNumSources();
+
+ std::optional<APInt> Value = std::nullopt;
+ for (unsigned I = 0; I < NumSources; ++I) {
+ std::optional<ValueAndVReg> IConstant =
+ getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+ if (!IConstant)
+ return std::nullopt;
+ if (!Value)
+ Value = IConstant->Value;
+ else if (*Value != IConstant->Value)
+ return std::nullopt;
+ }
+ return Value;
+}
+
+// TODO: use knownbits to determine zeros
+bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
+ BuildFnTy &MatchInfo) {
+ uint32_t Flags = Select->getFlags();
+ Register Dest = Select->getReg(0);
+ Register Cond = Select->getCondReg();
+ Register True = Select->getTrueReg();
+ Register False = Select->getFalseReg();
+ LLT CondTy = MRI.getType(Select->getCondReg());
+ LLT TrueTy = MRI.getType(Select->getTrueReg());
+
+ // We only do this combine for scalar boolean conditions.
+ if (CondTy != LLT::scalar(1))
+ return false;
+
+ // Both are scalars.
+ std::optional<ValueAndVReg> TrueOpt =
+ getIConstantVRegValWithLookThrough(True, MRI);
+ std::optional<ValueAndVReg> FalseOpt =
+ getIConstantVRegValWithLookThrough(False, MRI);
+
+ if (!TrueOpt || !FalseOpt)
+ return false;
+
+ APInt TrueValue = TrueOpt->Value;
+ APInt FalseValue = FalseOpt->Value;
+
+ // select Cond, 1, 0 --> zext (Cond)
+ if (TrueValue.isOne() && FalseValue.isZero()) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ B.buildZExtOrTrunc(Dest, Cond);
+ };
+ return true;
+ }
+
+ // select Cond, -1, 0 --> sext (Cond)
+ if (TrueValue.isAllOnes() && FalseValue.isZero()) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ B.buildSExtOrTrunc(Dest, Cond);
+ };
+ return true;
+ }
+
+ // select Cond, 0, 1 --> zext (!Cond)
+ if (TrueValue.isZero() && FalseValue.isOne()) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Inner = MRI.createGenericVirtualRegister(CondTy);
+ B.buildNot(Inner, Cond);
+ B.buildZExtOrTrunc(Dest, Inner);
+ };
+ return true;
+ }
+
+ // select Cond, 0, -1 --> sext (!Cond)
+ if (TrueValue.isZero() && FalseValue.isAllOnes()) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Inner = MRI.createGenericVirtualRegister(CondTy);
+ B.buildNot(Inner, Cond);
+ B.buildSExtOrTrunc(Dest, Inner);
+ };
+ return true;
+ }
+
+ // select Cond, C1, C1-1 --> add (zext Cond), C1-1
+ if (TrueValue - 1 == FalseValue) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildZExtOrTrunc(Inner, Cond);
+ B.buildAdd(Dest, Inner, False);
+ };
+ return true;
+ }
+
+ // select Cond, C1, C1+1 --> add (sext Cond), C1+1
+ if (TrueValue + 1 == FalseValue) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildSExtOrTrunc(Inner, Cond);
+ B.buildAdd(Dest, Inner, False);
+ };
+ return true;
+ }
+
+ // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
+ if (TrueValue.isPowerOf2() && FalseValue.isZero()) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildZExtOrTrunc(Inner, Cond);
+ // The shift amount must be scalar.
+ LLT ShiftTy = TrueTy.isVector() ? TrueTy.getElementType() : TrueTy;
+ auto ShAmtC = B.buildConstant(ShiftTy, TrueValue.exactLogBase2());
+ B.buildShl(Dest, Inner, ShAmtC, Flags);
+ };
+ return true;
+ }
+ // select Cond, -1, C --> or (sext Cond), C
+ if (TrueValue.isAllOnes()) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildSExtOrTrunc(Inner, Cond);
+ B.buildOr(Dest, Inner, False, Flags);
+ };
+ return true;
+ }
+
+ // select Cond, C, -1 --> or (sext (not Cond)), C
+ if (FalseValue.isAllOnes()) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Not = MRI.createGenericVirtualRegister(CondTy);
+ B.buildNot(Not, Cond);
+ Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildSExtOrTrunc(Inner, Not);
+ B.buildOr(Dest, Inner, True, Flags);
+ };
+ return true;
+ }
+
+ return false;
+}
+
+// TODO: use knownbits to determine zeros
+bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select,
+ BuildFnTy &MatchInfo) {
+ uint32_t Flags = Select->getFlags();
+ Register DstReg = Select->getReg(0);
+ Register Cond = Select->getCondReg();
+ Register True = Select->getTrueReg();
+ Register False = Select->getFalseReg();
+ LLT CondTy = MRI.getType(Select->getCondReg());
+ LLT TrueTy = MRI.getType(Select->getTrueReg());
+
+ // Boolean or fixed vector of booleans.
+ if (CondTy.isScalableVector() ||
+ (CondTy.isFixedVector() &&
+ CondTy.getElementType().getScalarSizeInBits() != 1) ||
+ CondTy.getScalarSizeInBits() != 1)
+ return false;
+
+ if (CondTy != TrueTy)
+ return false;
+
+ // select Cond, Cond, F --> or Cond, F
+ // select Cond, 1, F --> or Cond, F
+ if ((Cond == True) || isOneOrOneSplat(True, /* AllowUndefs */ true)) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildZExtOrTrunc(Ext, Cond);
+ B.buildOr(DstReg, Ext, False, Flags);
+ };
+ return true;
+ }
+
+ // select Cond, T, Cond --> and Cond, T
+ // select Cond, T, 0 --> and Cond, T
+ if ((Cond == False) || isZeroOrZeroSplat(False, /* AllowUndefs */ true)) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildZExtOrTrunc(Ext, Cond);
+ B.buildAnd(DstReg, Ext, True);
+ };
+ return true;
+ }
+
+ // select Cond, T, 1 --> or (not Cond), T
+ if (isOneOrOneSplat(False, /* AllowUndefs */ true)) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ // First the not.
+ Register Inner = MRI.createGenericVirtualRegister(CondTy);
+ B.buildNot(Inner, Cond);
+ // Then an ext to match the destination register.
+ Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildZExtOrTrunc(Ext, Inner);
+ B.buildOr(DstReg, Ext, True, Flags);
+ };
+ return true;
+ }
+
+ // select Cond, 0, F --> and (not Cond), F
+ if (isZeroOrZeroSplat(True, /* AllowUndefs */ true)) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*Select);
+ // First the not.
+ Register Inner = MRI.createGenericVirtualRegister(CondTy);
+ B.buildNot(Inner, Cond);
+ // Then an ext to match the destination register.
+ Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+ B.buildZExtOrTrunc(Ext, Inner);
+ B.buildAnd(DstReg, Ext, False);
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ GSelect *Select = cast<GSelect>(&MI);
+
+ if (tryFoldSelectOfConstants(Select, MatchInfo))
+ return true;
+
+ if (tryFoldBoolSelectToLogic(Select, MatchInfo))
+ return true;
+
+ return false;
+}
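
The constant-select folds above reduce to plain integer identities once the i1 condition is modeled as zext (0/1) or sext (0/all-ones). A minimal standalone sketch that checks those identities with 32-bit arithmetic; it uses no GlobalISel APIs, and the constants are arbitrary examples:

#include <cassert>
#include <cstdint>

// Model zext(i1) and sext(i1) on a 32-bit value.
static uint32_t zext1(bool C) { return C ? 1u : 0u; }
static uint32_t sext1(bool C) { return C ? ~0u : 0u; }

static uint32_t sel(bool C, uint32_t T, uint32_t F) { return C ? T : F; }

int main() {
  for (bool C : {false, true}) {
    // select C, 1, 0  -->  zext C
    assert(sel(C, 1, 0) == zext1(C));
    // select C, -1, 0  -->  sext C
    assert(sel(C, ~0u, 0) == sext1(C));
    // select C, 0, 1  -->  zext (!C)
    assert(sel(C, 0, 1) == zext1(!C));
    // select C, C1, C1-1  -->  add (zext C), C1-1
    uint32_t C1 = 42;
    assert(sel(C, C1, C1 - 1) == zext1(C) + (C1 - 1));
    // select C, Pow2, 0  -->  shl (zext C), log2(Pow2)
    assert(sel(C, 16, 0) == (zext1(C) << 4));
    // select C, -1, F  -->  or (sext C), F
    uint32_t F = 0x1234;
    assert(sel(C, ~0u, F) == (sext1(C) | F));
  }
  return 0;
}
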
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index aeb8a20e1f12..9037f752dc4f 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -2413,7 +2413,7 @@ bool InstrRefBasedLDV::mlocJoin(
// Pick out the first predecessors live-out value for this location. It's
// guaranteed to not be a backedge, as we order by RPO.
- ValueIDNum FirstVal = OutLocs[BlockOrders[0]->getNumber()][Idx.asU64()];
+ ValueIDNum FirstVal = OutLocs[*BlockOrders[0]][Idx.asU64()];
// If we've already eliminated a PHI here, do no further checking, just
// propagate the first live-in value into this block.
@@ -2430,8 +2430,7 @@ bool InstrRefBasedLDV::mlocJoin(
bool Disagree = false;
for (unsigned int I = 1; I < BlockOrders.size(); ++I) {
const MachineBasicBlock *PredMBB = BlockOrders[I];
- const ValueIDNum &PredLiveOut =
- OutLocs[PredMBB->getNumber()][Idx.asU64()];
+ const ValueIDNum &PredLiveOut = OutLocs[*PredMBB][Idx.asU64()];
// Incoming values agree, continue trying to eliminate this PHI.
if (FirstVal == PredLiveOut)
@@ -2556,7 +2555,7 @@ void InstrRefBasedLDV::placeMLocPHIs(
auto InstallPHIsAtLoc = [&PHIBlocks, &MInLocs](LocIdx L) {
for (const MachineBasicBlock *MBB : PHIBlocks)
- MInLocs[MBB->getNumber()][L.asU64()] = ValueIDNum(MBB->getNumber(), 0, L);
+ MInLocs[*MBB][L.asU64()] = ValueIDNum(MBB->getNumber(), 0, L);
};
// For locations with no reg units, just place PHIs.
@@ -2635,7 +2634,8 @@ void InstrRefBasedLDV::buildMLocValueMap(
// Initialize entry block to PHIs. These represent arguments.
for (auto Location : MTracker->locations())
- MInLocs[0][Location.Idx.asU64()] = ValueIDNum(0, 0, Location.Idx);
+ MInLocs.tableForEntryMBB()[Location.Idx.asU64()] =
+ ValueIDNum(0, 0, Location.Idx);
MTracker->reset();
@@ -2664,7 +2664,7 @@ void InstrRefBasedLDV::buildMLocValueMap(
// Join the values in all predecessor blocks.
bool InLocsChanged;
- InLocsChanged = mlocJoin(*MBB, Visited, MOutLocs, MInLocs[CurBB]);
+ InLocsChanged = mlocJoin(*MBB, Visited, MOutLocs, MInLocs[*MBB]);
InLocsChanged |= Visited.insert(MBB).second;
// Don't examine transfer function if we've visited this loc at least
@@ -2673,7 +2673,7 @@ void InstrRefBasedLDV::buildMLocValueMap(
continue;
// Load the current set of live-ins into MLocTracker.
- MTracker->loadFromArray(MInLocs[CurBB], CurBB);
+ MTracker->loadFromArray(MInLocs[*MBB], CurBB);
// Each element of the transfer function can be a new def, or a read of
// a live-in value. Evaluate each element, and store to "ToRemap".
@@ -2700,8 +2700,8 @@ void InstrRefBasedLDV::buildMLocValueMap(
// the transfer function, and mlocJoin.
bool OLChanged = false;
for (auto Location : MTracker->locations()) {
- OLChanged |= MOutLocs[CurBB][Location.Idx.asU64()] != Location.Value;
- MOutLocs[CurBB][Location.Idx.asU64()] = Location.Value;
+ OLChanged |= MOutLocs[*MBB][Location.Idx.asU64()] != Location.Value;
+ MOutLocs[*MBB][Location.Idx.asU64()] = Location.Value;
}
MTracker->reset();
@@ -2844,7 +2844,6 @@ std::optional<ValueIDNum> InstrRefBasedLDV::pickOperandPHILoc(
unsigned NumLocs = MTracker->getNumLocs();
for (const auto p : BlockOrders) {
- unsigned ThisBBNum = p->getNumber();
auto OutValIt = LiveOuts.find(p);
assert(OutValIt != LiveOuts.end());
const DbgValue &OutVal = *OutValIt->second;
@@ -2863,7 +2862,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::pickOperandPHILoc(
ValueIDNum ValToLookFor = OutValOp.ID;
// Search the live-outs of the predecessor for the specified value.
for (unsigned int I = 0; I < NumLocs; ++I) {
- if (MOutLocs[ThisBBNum][I] == ValToLookFor)
+ if (MOutLocs[*p][I] == ValToLookFor)
Locs.back().push_back(LocIdx(I));
}
} else {
@@ -2876,7 +2875,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::pickOperandPHILoc(
// machine-value PHI locations.
for (unsigned int I = 0; I < NumLocs; ++I) {
ValueIDNum MPHI(MBB.getNumber(), 0, LocIdx(I));
- if (MOutLocs[ThisBBNum][I] == MPHI)
+ if (MOutLocs[*p][I] == MPHI)
Locs.back().push_back(LocIdx(I));
}
}
@@ -3498,19 +3497,15 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit(
// Helper lambda for ejecting a block -- if nothing is going to use the block,
// we can translate the variable location information into DBG_VALUEs and then
// free all of InstrRefBasedLDV's data structures.
- SmallPtrSet<const MachineBasicBlock *, 8> EjectedBBs;
auto EjectBlock = [&](MachineBasicBlock &MBB) -> void {
- if (EjectedBBs.insert(&MBB).second == false)
- return;
unsigned BBNum = MBB.getNumber();
AllTheVLocs[BBNum].clear();
// Prime the transfer-tracker, and then step through all the block
// instructions, installing transfers.
MTracker->reset();
- MTracker->loadFromArray(MInLocs[BBNum], BBNum);
- TTracker->loadInlocs(MBB, MInLocs[BBNum], DbgOpStore, Output[BBNum],
- NumLocs);
+ MTracker->loadFromArray(MInLocs[MBB], BBNum);
+ TTracker->loadInlocs(MBB, MInLocs[MBB], DbgOpStore, Output[BBNum], NumLocs);
CurBB = BBNum;
CurInst = 1;
@@ -3521,8 +3516,8 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit(
}
// Free machine-location tables for this block.
- MInLocs[BBNum] = ValueTable();
- MOutLocs[BBNum] = ValueTable();
+ MInLocs.ejectTableForBlock(MBB);
+ MOutLocs.ejectTableForBlock(MBB);
// We don't need live-in variable values for this block either.
Output[BBNum].clear();
AllTheVLocs[BBNum].clear();
@@ -3587,7 +3582,8 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit(
// anything for such out-of-scope blocks, but for the sake of being similar
// to VarLocBasedLDV, eject these too.
for (auto *MBB : ArtificialBlocks)
- EjectBlock(*MBB);
+ if (MInLocs.hasTableFor(*MBB))
+ EjectBlock(*MBB);
return emitTransfers(AllVarsNumbering);
}
@@ -3686,8 +3682,8 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
// machine values. The outer dimension is the block number; while the inner
// dimension is a LocIdx from MLocTracker.
unsigned NumLocs = MTracker->getNumLocs();
- FuncValueTable MOutLocs(MaxNumBlocks, ValueTable(NumLocs));
- FuncValueTable MInLocs(MaxNumBlocks, ValueTable(NumLocs));
+ FuncValueTable MOutLocs(MaxNumBlocks, NumLocs);
+ FuncValueTable MInLocs(MaxNumBlocks, NumLocs);
// Solve the machine value dataflow problem using the MLocTransfer function,
// storing the computed live-ins / live-outs into the array-of-arrays. We use
@@ -3725,7 +3721,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
CurBB = MBB.getNumber();
VTracker = &vlocs[CurBB];
VTracker->MBB = &MBB;
- MTracker->loadFromArray(MInLocs[CurBB], CurBB);
+ MTracker->loadFromArray(MInLocs[MBB], CurBB);
CurInst = 1;
for (auto &MI : MBB) {
process(MI, &MOutLocs, &MInLocs);
@@ -3939,7 +3935,7 @@ public:
/// Find the live-in value number for the given block. Looks up the value at
/// the PHI location on entry.
BlockValueNum getValue(LDVSSABlock *LDVBB) {
- return MLiveIns[LDVBB->BB.getNumber()][Loc.asU64()].asU64();
+ return MLiveIns[LDVBB->BB][Loc.asU64()].asU64();
}
};
@@ -4179,8 +4175,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIsImpl(
});
for (auto &PHI : SortedPHIs) {
- ValueIDNum ThisBlockValueNum =
- MLiveIns[PHI->ParentBlock->BB.getNumber()][Loc.asU64()];
+ ValueIDNum ThisBlockValueNum = MLiveIns[PHI->ParentBlock->BB][Loc.asU64()];
// Are all these things actually defined?
for (auto &PHIIt : PHI->IncomingValues) {
@@ -4189,7 +4184,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIsImpl(
return std::nullopt;
ValueIDNum ValueToCheck;
- const ValueTable &BlockLiveOuts = MLiveOuts[PHIIt.first->BB.getNumber()];
+ const ValueTable &BlockLiveOuts = MLiveOuts[PHIIt.first->BB];
auto VVal = ValidatedValues.find(PHIIt.first);
if (VVal == ValidatedValues.end()) {
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
index d6dbb1feda3e..ccc284b62331 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
@@ -207,9 +207,48 @@ using namespace llvm;
/// Type for a table of values in a block.
using ValueTable = SmallVector<ValueIDNum, 0>;
-/// Type for a table-of-table-of-values, i.e., the collection of either
-/// live-in or live-out values for each block in the function.
-using FuncValueTable = SmallVector<ValueTable, 0>;
+/// A collection of ValueTables, one per BB in a function, with convenient
+/// accessor methods.
+struct FuncValueTable {
+ FuncValueTable(int NumBBs, int NumLocs) {
+ Storage.reserve(NumBBs);
+ for (int i = 0; i != NumBBs; ++i)
+ Storage.push_back(
+ std::make_unique<ValueTable>(NumLocs, ValueIDNum::EmptyValue));
+ }
+
+ /// Returns the ValueTable associated with MBB.
+ ValueTable &operator[](const MachineBasicBlock &MBB) const {
+ return (*this)[MBB.getNumber()];
+ }
+
+ /// Returns the ValueTable associated with the MachineBasicBlock whose number
+ /// is MBBNum.
+ ValueTable &operator[](int MBBNum) const {
+ auto &TablePtr = Storage[MBBNum];
+ assert(TablePtr && "Trying to access a deleted table");
+ return *TablePtr;
+ }
+
+ /// Returns the ValueTable associated with the entry MachineBasicBlock.
+ ValueTable &tableForEntryMBB() const { return (*this)[0]; }
+
+ /// Returns true if the ValueTable associated with MBB has not been freed.
+ bool hasTableFor(MachineBasicBlock &MBB) const {
+ return Storage[MBB.getNumber()] != nullptr;
+ }
+
+ /// Frees the memory of the ValueTable associated with MBB.
+ void ejectTableForBlock(const MachineBasicBlock &MBB) {
+ Storage[MBB.getNumber()].reset();
+ }
+
+private:
+ /// ValueTables are stored as unique_ptrs to allow for deallocation during
+ /// LDV; this was measured to have a significant impact on compiler memory
+ /// usage.
+ SmallVector<std::unique_ptr<ValueTable>, 0> Storage;
+};
/// Thin wrapper around an integer -- designed to give more type safety to
/// spill location numbers.
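
The FuncValueTable change above swaps a flat vector-of-vectors for per-block tables held by unique_ptr, so a block's table can be freed as soon as it is ejected. A minimal standalone sketch of that ownership pattern; the type names here are illustrative stand-ins, not the LiveDebugValues types:

#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

// Illustrative stand-ins for ValueIDNum / ValueTable.
using Value = uint64_t;
using Table = std::vector<Value>;

// Per-block storage that can be freed block by block, mirroring the
// unique_ptr-based FuncValueTable above.
class PerBlockTables {
  std::vector<std::unique_ptr<Table>> Storage;

public:
  PerBlockTables(unsigned NumBlocks, unsigned NumLocs) {
    Storage.reserve(NumBlocks);
    for (unsigned I = 0; I != NumBlocks; ++I)
      Storage.push_back(std::make_unique<Table>(NumLocs, Value(0)));
  }

  Table &operator[](unsigned Block) {
    assert(Storage[Block] && "accessing an ejected table");
    return *Storage[Block];
  }

  bool hasTableFor(unsigned Block) const { return Storage[Block] != nullptr; }

  // Free a block's table once nothing will read it again.
  void ejectTableForBlock(unsigned Block) { Storage[Block].reset(); }
};

int main() {
  PerBlockTables Tables(/*NumBlocks=*/3, /*NumLocs=*/4);
  Tables[1][2] = 42;
  Tables.ejectTableForBlock(0);
  assert(!Tables.hasTableFor(0));
  assert(Tables.hasTableFor(1) && Tables[1][2] == 42);
  return 0;
}
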
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 0203034b5a01..643370f0573d 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -426,8 +426,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
// Erase any virtregs that are now empty and unused. There may be <undef>
// uses around. Keep the empty live range in that case.
- for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
- Register Reg = RegsToErase[i];
+ for (Register Reg : RegsToErase) {
if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
ToShrink.remove(&LIS.getInterval(Reg));
eraseVirtReg(Reg);
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index a032b31a1fc7..51e944d0279f 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -175,8 +175,46 @@ public:
if (MachineInstr *MI = I->second.MI) {
std::optional<DestSourcePair> CopyOperands =
isCopyInstr(*MI, TII, UseCopyInstr);
- markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
- TRI);
+
+ MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+ MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+
+ markRegsUnavailable(Def, TRI);
+
+      // Since we clobber the destination of a copy, the semantics of Src's
+      // "DefRegs" containing Def are no longer valid. We also need to remove
+      // the record from the copy maps that indicates Src defined Def.
+      // Failing to do so might cause the target to miss some opportunities
+      // to further eliminate redundant copy instructions.
+ // Consider the following sequence during the
+ // ForwardCopyPropagateBlock procedure:
+ // L1: r0 = COPY r9 <- TrackMI
+ // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
+ // L3: use r0 <- Remove L2 from MaybeDeadCopies
+ // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
+ // L5: r0 = COPY r8 <- Remove NopCopy
+ for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+ auto SrcCopy = Copies.find(SrcUnit);
+ if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
+ // If SrcCopy defines multiple values, we only need
+ // to erase the record for Def in DefRegs.
+ for (auto itr = SrcCopy->second.DefRegs.begin();
+ itr != SrcCopy->second.DefRegs.end(); itr++) {
+ if (*itr == Def) {
+ SrcCopy->second.DefRegs.erase(itr);
+            // If DefRegs becomes empty after the removal, we can remove
+            // SrcCopy from the tracker's copy maps. We only remove entries
+            // that solely record that Def is defined by Src. If an entry
+            // also contains definition records for other Def registers,
+            // it cannot be cleared.
+ if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) {
+ Copies.erase(SrcCopy);
+ }
+ break;
+ }
+ }
+ }
+ }
}
// Now we can erase the copy.
Copies.erase(I);
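
The tracker cleanup above amounts to erasing one Def from the per-source record and dropping the record entirely once it carries no other information. A minimal standalone sketch with a plain map; the register numbers mirror the r0/r8/r9 example in the comment but are otherwise arbitrary:

#include <algorithm>
#include <cassert>
#include <map>
#include <vector>

int main() {
  // For each source register, the registers it was recorded as defining.
  std::map<unsigned, std::vector<unsigned>> DefRegs;
  DefRegs[9] = {0};    // r9 was recorded as defining r0
  DefRegs[8] = {0, 1}; // r8 was recorded as defining r0 and r1

  auto Invalidate = [&DefRegs](unsigned Src, unsigned Def) {
    auto It = DefRegs.find(Src);
    if (It == DefRegs.end())
      return;
    auto &Defs = It->second;
    Defs.erase(std::remove(Defs.begin(), Defs.end(), Def), Defs.end());
    // Only drop the whole record when nothing else is tracked for Src.
    if (Defs.empty())
      DefRegs.erase(It);
  };

  Invalidate(9, 0); // r9 no longer defines anything -> record removed
  Invalidate(8, 0); // r8 still defines r1 -> record kept
  assert(DefRegs.count(9) == 0);
  assert(DefRegs.count(8) == 1 && DefRegs[8] == std::vector<unsigned>({1}));
  return 0;
}
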
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index b9db34f7be95..6eeed8b5c3f7 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -208,8 +208,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
}
SmallSet<Register, 32> Added;
- for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
- Register Reg = LocalDefs[i];
+ for (Register Reg : LocalDefs) {
if (Added.insert(Reg).second) {
// If it's not live beyond end of the bundle, mark it dead.
bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg);
@@ -218,8 +217,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
}
}
- for (unsigned i = 0, e = ExternUses.size(); i != e; ++i) {
- Register Reg = ExternUses[i];
+ for (Register Reg : ExternUses) {
bool isKill = KilledUseSet.count(Reg);
bool isUndef = UndefUseSet.count(Reg);
MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp
index aff4d95781f4..5bd6ca0978a4 100644
--- a/llvm/lib/CodeGen/MacroFusion.cpp
+++ b/llvm/lib/CodeGen/MacroFusion.cpp
@@ -212,15 +212,9 @@ bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGInstrs &DAG, SUnit &AnchorSU)
}
std::unique_ptr<ScheduleDAGMutation>
-llvm::createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates) {
+llvm::createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates,
+ bool BranchOnly) {
if (EnableMacroFusion)
- return std::make_unique<MacroFusion>(Predicates, true);
- return nullptr;
-}
-
-std::unique_ptr<ScheduleDAGMutation> llvm::createBranchMacroFusionDAGMutation(
- ArrayRef<MacroFusionPredTy> Predicates) {
- if (EnableMacroFusion)
- return std::make_unique<MacroFusion>(Predicates, false);
+ return std::make_unique<MacroFusion>(Predicates, !BranchOnly);
return nullptr;
}
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 40c42cabf776..e81d47930136 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -62,6 +62,118 @@ static RegisterRegAlloc fastRegAlloc("fast", "fast register allocator",
namespace {
+/// Assigns an ascending index to each instruction in a machine basic block.
+/// The index can be used to determine dominance between instructions in the same MBB.
+class InstrPosIndexes {
+public:
+ void unsetInitialized() { IsInitialized = false; }
+
+ void init(const MachineBasicBlock &MBB) {
+ CurMBB = &MBB;
+ Instr2PosIndex.clear();
+ uint64_t LastIndex = 0;
+ for (const MachineInstr &MI : MBB) {
+ LastIndex += InstrDist;
+ Instr2PosIndex[&MI] = LastIndex;
+ }
+ }
+
+  /// Set \p Index to the index of \p MI. If \p MI is newly inserted, try to
+  /// assign it an index without affecting existing instructions' indexes.
+  /// Return true if the indexes of all instructions have been reassigned.
+ bool getIndex(const MachineInstr &MI, uint64_t &Index) {
+ if (!IsInitialized) {
+ init(*MI.getParent());
+ IsInitialized = true;
+ Index = Instr2PosIndex.at(&MI);
+ return true;
+ }
+
+ assert(MI.getParent() == CurMBB && "MI is not in CurMBB");
+ auto It = Instr2PosIndex.find(&MI);
+ if (It != Instr2PosIndex.end()) {
+ Index = It->second;
+ return false;
+ }
+
+    // Distance is the number of consecutive unassigned instructions,
+    // including MI. Start is the first of those instructions, and End is the
+    // instruction following the last of them.
+ // e.g.
+ // |Instruction| A | B | C | MI | D | E |
+ // | Index | 1024 | | | | | 2048 |
+ //
+ // In this case, B, C, MI, D are unassigned. Distance is 4, Start is B, End
+ // is E.
+ unsigned Distance = 1;
+ MachineBasicBlock::const_iterator Start = MI.getIterator(),
+ End = std::next(Start);
+ while (Start != CurMBB->begin() &&
+ !Instr2PosIndex.count(&*std::prev(Start))) {
+ --Start;
+ ++Distance;
+ }
+ while (End != CurMBB->end() && !Instr2PosIndex.count(&*(End))) {
+ ++End;
+ ++Distance;
+ }
+
+    // LastIndex is initialized to the last index assigned before MI, or zero.
+    // In the example above, LastIndex is 1024 and EndIndex is 2048.
+ uint64_t LastIndex =
+ Start == CurMBB->begin() ? 0 : Instr2PosIndex.at(&*std::prev(Start));
+ uint64_t Step;
+ if (End == CurMBB->end())
+ Step = static_cast<uint64_t>(InstrDist);
+ else {
+ // No instruction uses index zero.
+ uint64_t EndIndex = Instr2PosIndex.at(&*End);
+ assert(EndIndex > LastIndex && "Index must be ascending order");
+ unsigned NumAvailableIndexes = EndIndex - LastIndex - 1;
+      // We want the index gaps between adjacent instructions to be as even
+      // as possible. Given A total available indexes, D consecutive
+      // unassigned instructions, and step S:
+      //   |<- S-1 -> MI <- S-1 -> MI <- A-S*D ->|
+      // There are S-1 available indexes between each unassigned instruction
+      // and its predecessor, and A-S*D available indexes between the last
+      // unassigned instruction and its successor.
+      // Ideally, we want
+      //    S-1 = A-S*D
+      // which gives
+      //    S = (A+1)/(D+1)
+      // A valid S must be an integer greater than zero, so
+      //    S <= (A+1)/(D+1)
+      // =>
+      //    A-S*D >= 0
+      // That means we can safely use (A+1)/(D+1) as the step.
+      // In the example above, Step is 204 and the indexes of B, C, MI, and D
+      // are 1228, 1432, 1636, and 1840.
+ Step = (NumAvailableIndexes + 1) / (Distance + 1);
+ }
+
+    // Reassign indexes for all instructions if the number of newly inserted
+    // instructions exceeds the available slots or all instructions are new.
+ if (LLVM_UNLIKELY(!Step || (!LastIndex && Step == InstrDist))) {
+ init(*CurMBB);
+ Index = Instr2PosIndex.at(&MI);
+ return true;
+ }
+
+ for (auto I = Start; I != End; ++I) {
+ LastIndex += Step;
+ Instr2PosIndex[&*I] = LastIndex;
+ }
+ Index = Instr2PosIndex.at(&MI);
+ return false;
+ }
+
+private:
+ bool IsInitialized = false;
+ enum { InstrDist = 1024 };
+ const MachineBasicBlock *CurMBB = nullptr;
+ DenseMap<const MachineInstr *, uint64_t> Instr2PosIndex;
+};
+
class RegAllocFast : public MachineFunctionPass {
public:
static char ID;
@@ -153,6 +265,9 @@ private:
// Register masks attached to the current instruction.
SmallVector<const uint32_t *> RegMasks;
+ // Assign index for each instruction to quickly determine dominance.
+ InstrPosIndexes PosIndexes;
+
void setPhysRegState(MCPhysReg PhysReg, unsigned NewState);
bool isPhysRegFree(MCPhysReg PhysReg) const;
@@ -339,18 +454,13 @@ int RegAllocFast::getStackSpaceFor(Register VirtReg) {
return FrameIdx;
}
-static bool dominates(MachineBasicBlock &MBB,
- MachineBasicBlock::const_iterator A,
- MachineBasicBlock::const_iterator B) {
- auto MBBEnd = MBB.end();
- if (B == MBBEnd)
- return true;
-
- MachineBasicBlock::const_iterator I = MBB.begin();
- for (; &*I != A && &*I != B; ++I)
- ;
-
- return &*I == A;
+static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A,
+ const MachineInstr &B) {
+ uint64_t IndexA, IndexB;
+ PosIndexes.getIndex(A, IndexA);
+ if (LLVM_UNLIKELY(PosIndexes.getIndex(B, IndexB)))
+ PosIndexes.getIndex(A, IndexA);
+ return IndexA < IndexB;
}
/// Returns false if \p VirtReg is known to not live out of the current block.
@@ -371,7 +481,7 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) {
MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
return true;
} else {
- if (!SelfLoopDef || dominates(*MBB, DefInst.getIterator(), SelfLoopDef))
+ if (!SelfLoopDef || dominates(PosIndexes, DefInst, *SelfLoopDef))
SelfLoopDef = &DefInst;
}
}
@@ -396,7 +506,7 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) {
// Try to handle some simple cases to avoid spilling and reloading every
// value inside a self looping block.
if (SelfLoopDef == &UseInst ||
- !dominates(*MBB, SelfLoopDef->getIterator(), UseInst.getIterator())) {
+ !dominates(PosIndexes, *SelfLoopDef, UseInst)) {
MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
return true;
}
@@ -1565,6 +1675,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
this->MBB = &MBB;
LLVM_DEBUG(dbgs() << "\nAllocating " << MBB);
+ PosIndexes.unsetInitialized();
RegUnitStates.assign(TRI->getNumRegUnits(), regFree);
assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?");
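
The index redistribution in InstrPosIndexes::getIndex comes down to the step arithmetic spelled out in the comment above. A minimal standalone sketch that reproduces the worked example (predecessor at 1024, four unassigned instructions, successor at 2048); it shares no code with RegAllocFast:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  uint64_t LastIndex = 1024; // index of the last assigned predecessor (A)
  uint64_t EndIndex = 2048;  // index of the next assigned successor (E)
  unsigned Distance = 4;     // B, C, MI, D are unassigned

  // A = number of available indexes strictly between LastIndex and EndIndex.
  uint64_t NumAvailable = EndIndex - LastIndex - 1;
  // S = (A+1)/(D+1) spreads the gap as evenly as possible while keeping
  // A - S*D >= 0, so the successor's index is never overtaken.
  uint64_t Step = (NumAvailable + 1) / (Distance + 1);
  assert(Step == 204);

  std::vector<uint64_t> Assigned;
  for (unsigned I = 0; I < Distance; ++I) {
    LastIndex += Step;
    Assigned.push_back(LastIndex);
  }
  // Matches the indexes quoted in the comment: 1228, 1432, 1636, 1840.
  assert(Assigned == std::vector<uint64_t>({1228, 1432, 1636, 1840}));
  assert(Assigned.back() < EndIndex);
  return 0;
}
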
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index fba8c35ecec2..17a9f55cccc0 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -165,8 +165,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
// CSR aliases go after the volatile registers, preserve the target's order.
- for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) {
- unsigned PhysReg = CSRAlias[i];
+ for (unsigned PhysReg : CSRAlias) {
uint8_t Cost = RegCosts[PhysReg];
if (Cost != LastCost)
LastCostChange = N;
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index c1af37c8510f..3fbb93795075 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -305,7 +305,11 @@ namespace {
/// number if it is not zero. If DstReg is a physical register and the
/// existing subregister number of the def / use being updated is not zero,
/// make sure to set it to the correct physical subregister.
- void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx);
+ ///
+ /// If \p IsSubregToReg, we are coalescing a DstReg = SUBREG_TO_REG
+ /// SrcReg. This introduces an implicit-def of DstReg on coalesced users.
+ void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx,
+ bool IsSubregToReg);
/// If the given machine operand reads only undefined lanes add an undef
/// flag.
@@ -1343,8 +1347,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
if (DstReg.isPhysical()) {
Register NewDstReg = DstReg;
- unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
- DefMI->getOperand(0).getSubReg());
+ unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx);
if (NewDstIdx)
NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
@@ -1493,7 +1496,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
MRI->setRegClass(DstReg, NewRC);
// Update machine operands and add flags.
- updateRegDefsUses(DstReg, DstReg, DstIdx);
+ updateRegDefsUses(DstReg, DstReg, DstIdx, false);
NewMI.getOperand(0).setSubReg(NewIdx);
// updateRegDefUses can add an "undef" flag to the definition, since
// it will replace DstReg with DstReg.DstIdx. If NewIdx is 0, make
@@ -1618,8 +1621,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
NewMI.addOperand(MO);
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
- for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
- MCRegister Reg = NewMIImplDefs[i];
+ for (MCRegister Reg : NewMIImplDefs) {
for (MCRegUnit Unit : TRI->regunits(Reg))
if (LiveRange *LR = LIS->getCachedRegUnit(Unit))
LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
@@ -1814,7 +1816,7 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
}
void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
- unsigned SubIdx) {
+ unsigned SubIdx, bool IsSubregToReg) {
bool DstIsPhys = DstReg.isPhysical();
LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
@@ -1854,6 +1856,8 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
+ bool FullDef = true;
+
// Replace SrcReg with DstReg in all UseMI operands.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
MachineOperand &MO = UseMI->getOperand(Ops[i]);
@@ -1861,9 +1865,13 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
// Adjust <undef> flags in case of sub-register joins. We don't want to
// turn a full def into a read-modify-write sub-register def and vice
// versa.
- if (SubIdx && MO.isDef())
+ if (SubIdx && MO.isDef()) {
MO.setIsUndef(!Reads);
+ if (!Reads)
+ FullDef = false;
+ }
+
// A subreg use of a partially undef (super) register may be a complete
// undef use now and then has to be marked that way.
if (MO.isUse() && !DstIsPhys) {
@@ -1895,6 +1903,25 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
MO.substVirtReg(DstReg, SubIdx, *TRI);
}
+ if (IsSubregToReg && !FullDef) {
+    // If the coalesced instruction doesn't fully define the register, we need
+ // to preserve the original super register liveness for SUBREG_TO_REG.
+ //
+ // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes,
+ // but it introduces liveness for other subregisters. Downstream users may
+ // have been relying on those bits, so we need to ensure their liveness is
+ // captured with a def of other lanes.
+
+ // FIXME: Need to add new subrange if tracking subranges. We could also
+ // skip adding this if we knew the other lanes are dead, and only for
+ // other lanes.
+
+ assert(!MRI->shouldTrackSubRegLiveness(DstReg) &&
+ "this should update subranges");
+ MachineInstrBuilder MIB(*MF, UseMI);
+ MIB.addReg(DstReg, RegState::ImplicitDefine);
+ }
+
LLVM_DEBUG({
dbgs() << "\t\tupdated: ";
if (!UseMI->isDebugInstr())
@@ -2094,6 +2121,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
});
}
+ const bool IsSubregToReg = CopyMI->isSubregToReg();
+
ShrinkMask = LaneBitmask::getNone();
ShrinkMainRange = false;
@@ -2161,9 +2190,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
// Rewrite all SrcReg operands to DstReg.
// Also update DstReg operands to include DstIdx if it is set.
- if (CP.getDstIdx())
- updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
- updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
+ if (CP.getDstIdx()) {
+ assert(!IsSubregToReg && "can this happen?");
+ updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx(), false);
+ }
+ updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(),
+ IsSubregToReg);
// Shrink subregister ranges if necessary.
if (ShrinkMask.any()) {
@@ -4236,8 +4268,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
InflateRegs.end());
LLVM_DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size()
<< " regs.\n");
- for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
- Register Reg = InflateRegs[i];
+ for (Register Reg : InflateRegs) {
if (MRI->reg_nodbg_empty(Reg))
continue;
if (MRI->recomputeRegClass(Reg)) {
diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
index 36c91b7fa97e..893aa4a91828 100644
--- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
+++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -15,14 +15,17 @@
#include "llvm/CodeGen/ReplaceWithVeclib.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -38,138 +41,137 @@ STATISTIC(NumTLIFuncDeclAdded,
STATISTIC(NumFuncUsedAdded,
"Number of functions added to `llvm.compiler.used`");
-static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) {
- Module *M = CI.getModule();
-
- Function *OldFunc = CI.getCalledFunction();
-
- // Check if the vector library function is already declared in this module,
- // otherwise insert it.
+/// Returns the vector library Function for \p TLIName, creating its
+/// declaration in the Module \p M if needed. When \p ScalarFunc is not null,
+/// its attributes are copied to the newly created Function.
+Function *getTLIFunction(Module *M, FunctionType *VectorFTy,
+ const StringRef TLIName,
+ Function *ScalarFunc = nullptr) {
Function *TLIFunc = M->getFunction(TLIName);
if (!TLIFunc) {
- TLIFunc = Function::Create(OldFunc->getFunctionType(),
- Function::ExternalLinkage, TLIName, *M);
- TLIFunc->copyAttributesFrom(OldFunc);
+ TLIFunc =
+ Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M);
+ if (ScalarFunc)
+ TLIFunc->copyAttributesFrom(ScalarFunc);
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `"
<< TLIName << "` of type `" << *(TLIFunc->getType())
<< "` to module.\n");
++NumTLIFuncDeclAdded;
-
- // Add the freshly created function to llvm.compiler.used,
- // similar to as it is done in InjectTLIMappings
+ // Add the freshly created function to llvm.compiler.used, similar to as it
+ // is done in InjectTLIMappings.
appendToCompilerUsed(*M, {TLIFunc});
-
LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName
<< "` to `@llvm.compiler.used`.\n");
++NumFuncUsedAdded;
}
+ return TLIFunc;
+}
- // Replace the call to the vector intrinsic with a call
- // to the corresponding function from the vector library.
- IRBuilder<> IRBuilder(&CI);
- SmallVector<Value *> Args(CI.args());
- // Preserve the operand bundles.
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI.getOperandBundlesAsDefs(OpBundles);
- CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles);
- assert(OldFunc->getFunctionType() == TLIFunc->getFunctionType() &&
- "Expecting function types to be identical");
- CI.replaceAllUsesWith(Replacement);
- if (isa<FPMathOperator>(Replacement)) {
- // Preserve fast math flags for FP math.
- Replacement->copyFastMathFlags(&CI);
+/// Replace the call to the vector intrinsic ( \p CalltoReplace ) with a call to
+/// the corresponding function from the vector library ( \p TLIVecFunc ).
+static void replaceWithTLIFunction(CallInst &CalltoReplace, VFInfo &Info,
+ Function *TLIVecFunc) {
+ IRBuilder<> IRBuilder(&CalltoReplace);
+ SmallVector<Value *> Args(CalltoReplace.args());
+ if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) {
+ auto *MaskTy = VectorType::get(Type::getInt1Ty(CalltoReplace.getContext()),
+ Info.Shape.VF);
+ Args.insert(Args.begin() + OptMaskpos.value(),
+ Constant::getAllOnesValue(MaskTy));
}
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
- << OldFunc->getName() << "` with call to `" << TLIName
- << "`.\n");
- ++NumCallsReplaced;
- return true;
+ // Preserve the operand bundles.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CalltoReplace.getOperandBundlesAsDefs(OpBundles);
+ CallInst *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles);
+ CalltoReplace.replaceAllUsesWith(Replacement);
+ // Preserve fast math flags for FP math.
+ if (isa<FPMathOperator>(Replacement))
+ Replacement->copyFastMathFlags(&CalltoReplace);
}
+/// Returns true when \p CallToReplace has been successfully replaced with a
+/// suitable function taking vector arguments, based on the mappings available
+/// in \p TLI. Currently this only works when \p CallToReplace is a call to a
+/// vectorizable intrinsic.
static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
- CallInst &CI) {
- if (!CI.getCalledFunction()) {
+ CallInst &CallToReplace) {
+ if (!CallToReplace.getCalledFunction())
return false;
- }
- auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID();
- if (IntrinsicID == Intrinsic::not_intrinsic) {
- // Replacement is only performed for intrinsic functions
+ auto IntrinsicID = CallToReplace.getCalledFunction()->getIntrinsicID();
+ // Replacement is only performed for intrinsic functions.
+ if (IntrinsicID == Intrinsic::not_intrinsic)
return false;
- }
- // Convert vector arguments to scalar type and check that
- // all vector operands have identical vector width.
+  // Compute the argument types of the corresponding scalar call and check
+  // that all vector operands of the vector call have the same element count.
ElementCount VF = ElementCount::getFixed(0);
- SmallVector<Type *> ScalarTypes;
- for (auto Arg : enumerate(CI.args())) {
- auto *ArgType = Arg.value()->getType();
- // Vector calls to intrinsics can still have
- // scalar operands for specific arguments.
+ SmallVector<Type *> ScalarArgTypes;
+ for (auto Arg : enumerate(CallToReplace.args())) {
+ auto *ArgTy = Arg.value()->getType();
if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) {
- ScalarTypes.push_back(ArgType);
- } else {
- // The argument in this place should be a vector if
- // this is a call to a vector intrinsic.
- auto *VectorArgTy = dyn_cast<VectorType>(ArgType);
- if (!VectorArgTy) {
- // The argument is not a vector, do not perform
- // the replacement.
- return false;
- }
- ElementCount NumElements = VectorArgTy->getElementCount();
- if (NumElements.isScalable()) {
- // The current implementation does not support
- // scalable vectors.
+ ScalarArgTypes.push_back(ArgTy);
+ } else if (auto *VectorArgTy = dyn_cast<VectorType>(ArgTy)) {
+ ScalarArgTypes.push_back(ArgTy->getScalarType());
+      // Disallow vector arguments with different VFs. When processing the
+      // first vector argument, store its VF; for the rest, ensure that they
+      // match it.
+ if (VF.isZero())
+ VF = VectorArgTy->getElementCount();
+ else if (VF != VectorArgTy->getElementCount())
return false;
- }
- if (VF.isNonZero() && VF != NumElements) {
- // The different arguments differ in vector size.
- return false;
- } else {
- VF = NumElements;
- }
- ScalarTypes.push_back(VectorArgTy->getElementType());
- }
+ } else
+ // Exit when it is supposed to be a vector argument but it isn't.
+ return false;
}
- // Try to reconstruct the name for the scalar version of this
- // intrinsic using the intrinsic ID and the argument types
- // converted to scalar above.
- std::string ScalarName;
- if (Intrinsic::isOverloaded(IntrinsicID)) {
- ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule());
- } else {
- ScalarName = Intrinsic::getName(IntrinsicID).str();
- }
+ // Try to reconstruct the name for the scalar version of this intrinsic using
+ // the intrinsic ID and the argument types converted to scalar above.
+ std::string ScalarName =
+ (Intrinsic::isOverloaded(IntrinsicID)
+ ? Intrinsic::getName(IntrinsicID, ScalarArgTypes,
+ CallToReplace.getModule())
+ : Intrinsic::getName(IntrinsicID).str());
+
+ // Try to find the mapping for the scalar version of this intrinsic and the
+ // exact vector width of the call operands in the TargetLibraryInfo. First,
+ // check with a non-masked variant, and if that fails try with a masked one.
+ const VecDesc *VD =
+ TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ false);
+ if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ true)))
+ return false;
- if (!TLI.isFunctionVectorizable(ScalarName)) {
- // The TargetLibraryInfo does not contain a vectorized version of
- // the scalar function.
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName
+ << "` and vector width " << VF << " to: `"
+ << VD->getVectorFnName() << "`.\n");
+
+ // Replace the call to the intrinsic with a call to the vector library
+ // function.
+ Type *ScalarRetTy = CallToReplace.getType()->getScalarType();
+ FunctionType *ScalarFTy =
+ FunctionType::get(ScalarRetTy, ScalarArgTypes, /*isVarArg*/ false);
+ const std::string MangledName = VD->getVectorFunctionABIVariantString();
+ auto OptInfo = VFABI::tryDemangleForVFABI(MangledName, ScalarFTy);
+ if (!OptInfo)
return false;
- }
- // Try to find the mapping for the scalar version of this intrinsic
- // and the exact vector width of the call operands in the
- // TargetLibraryInfo.
- StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF);
-
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
- << ScalarName << "` and vector width " << VF << ".\n");
-
- if (!TLIName.empty()) {
- // Found the correct mapping in the TargetLibraryInfo,
- // replace the call to the intrinsic with a call to
- // the vector library function.
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName
- << "`.\n");
- return replaceWithTLIFunction(CI, TLIName);
- }
+ FunctionType *VectorFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy);
+ if (!VectorFTy)
+ return false;
+
+ Function *FuncToReplace = CallToReplace.getCalledFunction();
+ Function *TLIFunc = getTLIFunction(CallToReplace.getModule(), VectorFTy,
+ VD->getVectorFnName(), FuncToReplace);
+ replaceWithTLIFunction(CallToReplace, *OptInfo, TLIFunc);
- return false;
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
+ << FuncToReplace->getName() << "` with call to `"
+ << TLIFunc->getName() << "`.\n");
+ ++NumCallsReplaced;
+ return true;
}
static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
@@ -185,9 +187,8 @@ static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
}
// Erase the calls to the intrinsics that have been replaced
// with calls to the vector library.
- for (auto *CI : ReplacedCalls) {
+ for (auto *CI : ReplacedCalls)
CI->eraseFromParent();
- }
return Changed;
}
@@ -207,10 +208,10 @@ PreservedAnalyses ReplaceWithVeclib::run(Function &F,
PA.preserve<DemandedBitsAnalysis>();
PA.preserve<OptimizationRemarkEmitterAnalysis>();
return PA;
- } else {
- // The pass did not replace any calls, hence it preserves all analyses.
- return PreservedAnalyses::all();
}
+
+ // The pass did not replace any calls, hence it preserves all analyses.
+ return PreservedAnalyses::all();
}
////////////////////////////////////////////////////////////////////////////////
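
The argument scan in replaceWithCallToVeclib enforces one invariant: every vector operand must share a single element count, while operands flagged as scalar pass through. A minimal standalone sketch of that check; the Operand struct is an illustrative stand-in, not an LLVM type:

#include <cassert>
#include <optional>
#include <vector>

struct Operand {
  bool IsVector;
  unsigned NumElements; // only meaningful when IsVector is true
};

// Returns the common element count of all vector operands, or nullopt when
// the operands disagree (mirroring the early "return false" above).
static std::optional<unsigned> commonVF(const std::vector<Operand> &Ops) {
  unsigned VF = 0; // 0 means "no vector operand seen yet"
  for (const Operand &Op : Ops) {
    if (!Op.IsVector)
      continue; // scalar operands are allowed alongside vectors
    if (VF == 0)
      VF = Op.NumElements;
    else if (VF != Op.NumElements)
      return std::nullopt;
  }
  return VF;
}

int main() {
  assert(commonVF({{true, 4}, {false, 0}, {true, 4}}) == 4u);
  assert(!commonVF({{true, 4}, {true, 8}}));
  return 0;
}
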
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c92a0c2a06d4..eafa95ce7fcf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -546,6 +546,7 @@ namespace {
SDValue visitFP_TO_FP16(SDNode *N);
SDValue visitFP16_TO_FP(SDNode *N);
SDValue visitFP_TO_BF16(SDNode *N);
+ SDValue visitBF16_TO_FP(SDNode *N);
SDValue visitVECREDUCE(SDNode *N);
SDValue visitVPOp(SDNode *N);
SDValue visitGET_FPENV_MEM(SDNode *N);
@@ -2047,6 +2048,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
case ISD::FP_TO_BF16: return visitFP_TO_BF16(N);
+ case ISD::BF16_TO_FP: return visitBF16_TO_FP(N);
case ISD::FREEZE: return visitFREEZE(N);
case ISD::GET_FPENV_MEM: return visitGET_FPENV_MEM(N);
case ISD::SET_FPENV_MEM: return visitSET_FPENV_MEM(N);
@@ -13703,8 +13705,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
- (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
- N0.getValueType()) ||
+ (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0), N0.getValueType()) ||
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
@@ -13935,8 +13936,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
if (N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
- !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
- N0.getValueType())) {
+ !TLI.isTruncateFree(N0.getOperand(0).getOperand(0), N0.getValueType())) {
SDLoc DL(N);
SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1));
@@ -18855,8 +18855,7 @@ struct LoadedSlice {
void addSliceGain(const LoadedSlice &LS) {
// Each slice saves a truncate.
const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
- if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
- LS.Inst->getValueType(0)))
+ if (!TLI.isTruncateFree(LS.Inst->getOperand(0), LS.Inst->getValueType(0)))
++Truncates;
// If there is a shift amount, this slice gets rid of it.
if (LS.Shift)
@@ -26259,14 +26258,17 @@ SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
}
SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
+ auto Op = N->getOpcode();
+ assert((Op == ISD::FP16_TO_FP || Op == ISD::BF16_TO_FP) &&
+ "opcode should be FP16_TO_FP or BF16_TO_FP.");
SDValue N0 = N->getOperand(0);
- // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
+ // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) or
+ // fold bf16_to_fp(op & 0xffff) -> bf16_to_fp(op)
if (!TLI.shouldKeepZExtForFP16Conv() && N0->getOpcode() == ISD::AND) {
ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
if (AndConst && AndConst->getAPIntValue() == 0xffff) {
- return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
- N0.getOperand(0));
+ return DAG.getNode(Op, SDLoc(N), N->getValueType(0), N0.getOperand(0));
}
}
@@ -26283,6 +26285,11 @@ SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitBF16_TO_FP(SDNode *N) {
+ // fold bf16_to_fp(op & 0xffff) -> bf16_to_fp(op)
+ return visitFP16_TO_FP(N);
+}
+
SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N0.getValueType();
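
The fp16_to_fp / bf16_to_fp fold above can drop the `and` with 0xffff because both conversions read only the low 16 bits of their operand. A minimal standalone sketch of that observation, modeling the conversion as a truncation to its 16-bit payload:

#include <cassert>
#include <cstdint>

// Stand-in for a 16-bit-payload conversion: all it can observe of its
// operand is the low 16 bits.
static uint16_t payload(uint32_t Op) { return static_cast<uint16_t>(Op); }

int main() {
  for (uint32_t Op : {0x0000u, 0x3c00u, 0xdeadbeefu, 0xffffffffu})
    // Masking with 0xffff before the conversion changes nothing.
    assert(payload(Op & 0xffffu) == payload(Op));
  return 0;
}
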
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index a83129586339..f3d8edb8926b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1000,8 +1000,7 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
if (!CanLowerReturn)
return false;
- for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
- EVT VT = RetTys[I];
+ for (EVT VT : RetTys) {
MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT);
unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT);
for (unsigned i = 0; i != NumRegs; ++i) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 03cba892a167..5926a6058111 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -377,8 +377,7 @@ Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
Register FirstReg;
- for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
- EVT ValueVT = ValueVTs[Value];
+ for (EVT ValueVT : ValueVTs) {
MVT RegisterVT = TLI->getRegisterType(Ty->getContext(), ValueVT);
unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index a27febe15db8..34fa1f5a7ed1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -495,7 +495,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
// EXTRACT_SUBREG is lowered as %dst = COPY %src:sub. There are no
// constraints on the %dst register, COPY can target all legal register
// classes.
- unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned SubIdx = Node->getConstantOperandVal(1);
const TargetRegisterClass *TRC =
TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent());
@@ -611,7 +611,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
// Create the new VReg in the destination class and emit a copy.
- unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned DstRCIdx = Node->getConstantOperandVal(1);
const TargetRegisterClass *DstRC =
TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx));
Register NewVReg = MRI->createVirtualRegister(DstRC);
@@ -629,7 +629,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
void InstrEmitter::EmitRegSequence(SDNode *Node,
DenseMap<SDValue, Register> &VRBaseMap,
bool IsClone, bool IsCloned) {
- unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ unsigned DstRCIdx = Node->getConstantOperandVal(0);
const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
Register NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC));
const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
@@ -1309,8 +1309,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
// Add all of the operand registers to the instruction.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- unsigned Flags =
- cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned Flags = Node->getConstantOperandVal(i);
const InlineAsm::Flag F(Flags);
const unsigned NumVals = F.getNumOperandRegisters();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 0917d0e4eb3e..4e317062cec4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4908,7 +4908,9 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
static MVT getPromotedVectorElementType(const TargetLowering &TLI,
MVT EltVT, MVT NewEltVT) {
unsigned OldEltsPerNewElt = EltVT.getSizeInBits() / NewEltVT.getSizeInBits();
- MVT MidVT = MVT::getVectorVT(NewEltVT, OldEltsPerNewElt);
+ MVT MidVT = OldEltsPerNewElt == 1
+ ? NewEltVT
+ : MVT::getVectorVT(NewEltVT, OldEltsPerNewElt);
assert(TLI.isTypeLegal(MidVT) && "unexpected");
return MidVT;
}
@@ -5352,6 +5354,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FEXP10:
+ case ISD::FCANONICALIZE:
Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
Results.push_back(
@@ -5394,7 +5397,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
"Invalid promote type for build_vector");
- assert(NewEltVT.bitsLT(EltVT) && "not handled");
+ assert(NewEltVT.bitsLE(EltVT) && "not handled");
MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
@@ -5405,7 +5408,9 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
}
SDLoc SL(Node);
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps);
+ SDValue Concat =
+ DAG.getNode(MidVT == NewEltVT ? ISD::BUILD_VECTOR : ISD::CONCAT_VECTORS,
+ SL, NVT, NewOps);
SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
Results.push_back(CvtVec);
break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index c4605a6b9598..65919a64b806 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2214,6 +2214,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FP_TO_UINT_SAT:
R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break;
case ISD::FP_EXTEND: R = PromoteFloatOp_FP_EXTEND(N, OpNo); break;
+ case ISD::STRICT_FP_EXTEND:
+ R = PromoteFloatOp_STRICT_FP_EXTEND(N, OpNo);
+ break;
case ISD::SELECT_CC: R = PromoteFloatOp_SELECT_CC(N, OpNo); break;
case ISD::SETCC: R = PromoteFloatOp_SETCC(N, OpNo); break;
case ISD::STORE: R = PromoteFloatOp_STORE(N, OpNo); break;
@@ -2276,6 +2279,26 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op);
}
+SDValue DAGTypeLegalizer::PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 1);
+
+ SDValue Op = GetPromotedFloat(N->getOperand(1));
+ EVT VT = N->getValueType(0);
+
+ // Desired VT is same as promoted type. Use promoted float directly.
+ if (VT == Op->getValueType(0)) {
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+ return Op;
+ }
+
+ // Else, extend the promoted float value to the desired VT.
+ SDValue Res = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), N->getVTList(),
+ N->getOperand(0), Op);
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
// Promote the float operands used for comparison. The true- and false-
// operands have the same type as the result and are promoted, if needed, by
// PromoteFloatRes_SELECT_CC
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 362fa92dd44b..3d21bd22e6ef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1871,6 +1871,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
Res = PromoteIntOp_VP_STRIDED(N, OpNo);
break;
+ case ISD::EXPERIMENTAL_VP_SPLICE:
+ Res = PromoteIntOp_VP_SPLICE(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -2549,6 +2552,20 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo) {
+ SmallVector<SDValue, 6> NewOps(N->op_begin(), N->op_end());
+
+ if (OpNo == 2) { // Offset operand
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ }
+
+ assert((OpNo == 4 || OpNo == 5) && "Unexpected operand for promotion");
+
+ NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
//===----------------------------------------------------------------------===//
// Integer Result Expansion
//===----------------------------------------------------------------------===//
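The EXPERIMENTAL_VP_SPLICE hunk sign-extends the (possibly negative) offset operand but zero-extends the mask/EVL operands when their types are promoted. A small standalone illustration of why the two widenings differ, using plain integer widening rather than the SelectionDAG promotion helpers:

#include <cstdint>
#include <cstdio>

int main() {
  int16_t Offset = -4;    // splice offsets may be negative
  uint16_t EVL = 0xFFF0;  // explicit vector lengths are unsigned counts

  // Sign-extension preserves the negative offset in the wider type.
  int32_t WideOffset = static_cast<int32_t>(Offset); // -4
  // Zero-extension preserves the unsigned count in the wider type.
  uint32_t WideEVL = static_cast<uint32_t>(EVL);     // 65520

  std::printf("offset: %d, evl: %u\n", WideOffset, WideEVL);
  return 0;
}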
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9d5931b44ac6..84b1b2c71fd0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -410,6 +410,7 @@ private:
SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -712,6 +713,7 @@ private:
SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo);
SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index ab4c33c9e976..e3acb58327a8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -296,28 +296,24 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
if (isNewLoad)
AddPred(LoadSU, ChainPred);
}
- for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
- const SDep &Pred = LoadPreds[i];
+ for (const SDep &Pred : LoadPreds) {
RemovePred(SU, Pred);
if (isNewLoad) {
AddPred(LoadSU, Pred);
}
}
- for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
- const SDep &Pred = NodePreds[i];
+ for (const SDep &Pred : NodePreds) {
RemovePred(SU, Pred);
AddPred(NewSU, Pred);
}
- for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
- SDep D = NodeSuccs[i];
+ for (SDep D : NodeSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
D.setSUnit(NewSU);
AddPred(SuccDep, D);
}
- for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
- SDep D = ChainSuccs[i];
+ for (SDep D : ChainSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
@@ -496,8 +492,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
--NumOps; // Ignore the glue operand.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- unsigned Flags =
- cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned Flags = Node->getConstantOperandVal(i);
const InlineAsm::Flag F(Flags);
unsigned NumVals = F.getNumOperandRegisters();
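The scheduler hunks replace index-based loops over the predecessor/successor work lists with range-based for loops, copying the element only where the body mutates it. A minimal illustration of the two idioms over a plain std::vector, with a simple struct standing in for SDep:

#include <cstdio>
#include <vector>

struct Dep { int SUnit; };

int main() {
  std::vector<Dep> Preds{{1}, {2}, {3}};

  // Old style: manual index bookkeeping.
  for (unsigned i = 0, e = Preds.size(); i != e; ++i)
    std::printf("pred %d\n", Preds[i].SUnit);

  // New style: const reference when only reading...
  for (const Dep &P : Preds)
    std::printf("pred %d\n", P.SUnit);

  // ...and a by-value copy when the body retargets the element locally.
  for (Dep D : Preds) {
    D.SUnit = -D.SUnit; // local edit; the original vector is untouched
    std::printf("retargeted %d\n", D.SUnit);
  }
  return 0;
}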
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 47c137d2bcad..dcecb2e0e7fa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -331,7 +331,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
unsigned Opcode = Node->getMachineOpcode();
if (Opcode == TargetOpcode::REG_SEQUENCE) {
- unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ unsigned DstRCIdx = Node->getConstantOperandVal(0);
const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
RegClass = RC->getID();
Cost = RegSequenceCost;
@@ -1369,8 +1369,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
--NumOps; // Ignore the glue operand.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- unsigned Flags =
- cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned Flags = Node->getConstantOperandVal(i);
const InlineAsm::Flag F(Flags);
unsigned NumVals = F.getNumOperandRegisters();
@@ -2298,8 +2297,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) {
continue;
}
if (POpc == TargetOpcode::REG_SEQUENCE) {
- unsigned DstRCIdx =
- cast<ConstantSDNode>(PN->getOperand(0))->getZExtValue();
+ unsigned DstRCIdx = PN->getConstantOperandVal(0);
const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
unsigned RCId = RC->getID();
// REG_SEQUENCE is untyped, so getRepRegClassCostFor could not be used
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5be1892a44f6..0e17bba2398e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5470,7 +5470,7 @@ static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
Ops[i].getOperand(0).getValueType() != VT ||
(IdentitySrc && Ops[i].getOperand(0) != IdentitySrc) ||
!isa<ConstantSDNode>(Ops[i].getOperand(1)) ||
- cast<ConstantSDNode>(Ops[i].getOperand(1))->getAPIntValue() != i) {
+ Ops[i].getConstantOperandAPInt(1) != i) {
IsIdentity = false;
break;
}
@@ -6858,8 +6858,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// expanding copies of large vectors from registers. This only works for
// fixed length vectors, since we need to know the exact number of
// elements.
- if (N2C && N1.getOperand(0).getValueType().isFixedLengthVector() &&
- N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) {
+ if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N1.getOperand(0).getValueType().isFixedLengthVector()) {
unsigned Factor =
N1.getOperand(0).getValueType().getVectorNumElements();
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
@@ -6976,7 +6976,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
// the concat have the same type as the extract.
- if (N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0 &&
+ if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
VT == N1.getOperand(0).getValueType()) {
unsigned Factor = VT.getVectorMinNumElements();
return N1.getOperand(N2C->getZExtValue() / Factor);
@@ -7408,7 +7408,7 @@ static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
Src.getOperand(1).getOpcode() == ISD::Constant) {
G = cast<GlobalAddressSDNode>(Src.getOperand(0));
- SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
+ SrcDelta = Src.getConstantOperandVal(1);
}
if (!G)
return false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 12ed4a82ee91..3c4b285cb067 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -10627,8 +10627,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
else if (CLI.RetZExt)
AssertOp = ISD::AssertZext;
unsigned CurReg = 0;
- for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
- EVT VT = RetTys[I];
+ for (EVT VT : RetTys) {
MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
CLI.CallConv, VT);
unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 3dc6e4bbcf46..f28211ac113c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -4181,8 +4181,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) {
Msg << "\nIn function: " << MF->getName();
} else {
bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
- unsigned iid =
- cast<ConstantSDNode>(N->getOperand(HasInputChain))->getZExtValue();
+ unsigned iid = N->getConstantOperandVal(HasInputChain);
if (iid < Intrinsic::num_intrinsics)
Msg << "intrinsic %" << Intrinsic::getBaseName((Intrinsic::ID)iid);
else if (const TargetIntrinsicInfo *TII = TM.getIntrinsicInfo())
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index bf689dbd308f..526cb847e8a0 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1124,8 +1124,7 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI(
}
}
- for (unsigned i = 0, e = OtherDefs.size(); i != e; ++i) {
- Register MOReg = OtherDefs[i];
+ for (Register MOReg : OtherDefs) {
if (regOverlapsSet(Uses, MOReg))
return false;
if (MOReg.isPhysical() && regOverlapsSet(LiveDefs, MOReg))
diff --git a/llvm/lib/Demangle/Demangle.cpp b/llvm/lib/Demangle/Demangle.cpp
index 83f3cdc88c01..117b849d1c78 100644
--- a/llvm/lib/Demangle/Demangle.cpp
+++ b/llvm/lib/Demangle/Demangle.cpp
@@ -47,7 +47,8 @@ static bool isRustEncoding(std::string_view S) { return starts_with(S, "_R"); }
static bool isDLangEncoding(std::string_view S) { return starts_with(S, "_D"); }
bool llvm::nonMicrosoftDemangle(std::string_view MangledName,
- std::string &Result, bool CanHaveLeadingDot) {
+ std::string &Result, bool CanHaveLeadingDot,
+ bool ParseParams) {
char *Demangled = nullptr;
// Do not consider the dot prefix as part of the demangled symbol name.
@@ -57,7 +58,7 @@ bool llvm::nonMicrosoftDemangle(std::string_view MangledName,
}
if (isItaniumEncoding(MangledName))
- Demangled = itaniumDemangle(MangledName);
+ Demangled = itaniumDemangle(MangledName, ParseParams);
else if (isRustEncoding(MangledName))
Demangled = rustDemangle(MangledName);
else if (isDLangEncoding(MangledName))
diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp
index e3f208f0adf8..5c21b06a1d09 100644
--- a/llvm/lib/Demangle/ItaniumDemangle.cpp
+++ b/llvm/lib/Demangle/ItaniumDemangle.cpp
@@ -366,13 +366,13 @@ public:
using Demangler = itanium_demangle::ManglingParser<DefaultAllocator>;
-char *llvm::itaniumDemangle(std::string_view MangledName) {
+char *llvm::itaniumDemangle(std::string_view MangledName, bool ParseParams) {
if (MangledName.empty())
return nullptr;
Demangler Parser(MangledName.data(),
MangledName.data() + MangledName.length());
- Node *AST = Parser.parse();
+ Node *AST = Parser.parse(ParseParams);
if (!AST)
return nullptr;
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 409bec7a874b..809b2d51f059 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -188,21 +188,41 @@ private:
Edge::Kind DeltaKind;
Symbol *TargetSymbol;
uint64_t Addend;
+
+ bool FixingFromSymbol = true;
if (&BlockToFix == &FromSymbol->getAddressable()) {
+ if (LLVM_UNLIKELY(&BlockToFix == &ToSymbol->getAddressable())) {
+ // From and To are symbols in the same block. Decide direction by offset
+ // instead.
+ if (ToSymbol->getAddress() > FixupAddress)
+ FixingFromSymbol = true;
+ else if (FromSymbol->getAddress() > FixupAddress)
+ FixingFromSymbol = false;
+ else
+ FixingFromSymbol = FromSymbol->getAddress() >= ToSymbol->getAddress();
+ } else
+ FixingFromSymbol = true;
+ } else {
+ if (&BlockToFix == &ToSymbol->getAddressable())
+ FixingFromSymbol = false;
+ else {
+ // BlockToFix was neither FromSymbol nor ToSymbol.
+ return make_error<JITLinkError>("SUBTRACTOR relocation must fix up "
+ "either 'A' or 'B' (or a symbol in one "
+ "of their alt-entry groups)");
+ }
+ }
+
+ if (FixingFromSymbol) {
TargetSymbol = ToSymbol;
DeltaKind = (SubRI.r_length == 3) ? aarch64::Delta64 : aarch64::Delta32;
Addend = FixupValue + (FixupAddress - FromSymbol->getAddress());
// FIXME: handle extern 'from'.
- } else if (&BlockToFix == &ToSymbol->getAddressable()) {
+ } else {
TargetSymbol = &*FromSymbol;
DeltaKind =
(SubRI.r_length == 3) ? aarch64::NegDelta64 : aarch64::NegDelta32;
Addend = FixupValue - (FixupAddress - ToSymbol->getAddress());
- } else {
- // BlockToFix was neither FromSymbol nor ToSymbol.
- return make_error<JITLinkError>("SUBTRACTOR relocation must fix up "
- "either 'A' or 'B' (or a symbol in one "
- "of their alt-entry groups)");
}
return PairRelocInfo(DeltaKind, TargetSymbol, Addend);
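Both MachO backends now decide which side of a SUBTRACTOR pair the fixup belongs to even when 'A' and 'B' live in the same block, by comparing the symbol addresses against the fixup address. A self-contained sketch of that tie-break with plain uint64_t addresses instead of JITLink's Symbol/Block types:

#include <cstdint>
#include <cstdio>

// Returns true when the fixup should be treated as belonging to the 'from'
// symbol (delta targets 'to'), mirroring the FixingFromSymbol tie-break used
// when both symbols share the block that contains the fixup.
bool fixingFromSymbol(uint64_t FromAddr, uint64_t ToAddr, uint64_t FixupAddr) {
  if (ToAddr > FixupAddr)
    return true;   // 'to' lies past the fixup
  if (FromAddr > FixupAddr)
    return false;  // 'from' lies past the fixup
  // Both symbols precede (or equal) the fixup; prefer the later one as 'from'.
  return FromAddr >= ToAddr;
}

int main() {
  std::printf("%d\n", fixingFromSymbol(0x0F0, 0x110, 0x100)); // 1
  std::printf("%d\n", fixingFromSymbol(0x110, 0x0F0, 0x100)); // 0
  return 0;
}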
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
index 49f619357f08..eeca27771ad6 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
@@ -179,21 +179,41 @@ private:
Edge::Kind DeltaKind;
Symbol *TargetSymbol;
uint64_t Addend;
+
+ bool FixingFromSymbol = true;
if (&BlockToFix == &FromSymbol->getAddressable()) {
+ if (LLVM_UNLIKELY(&BlockToFix == &ToSymbol->getAddressable())) {
+ // From and To are symbols in the same block. Decide direction by offset
+ // instead.
+ if (ToSymbol->getAddress() > FixupAddress)
+ FixingFromSymbol = true;
+ else if (FromSymbol->getAddress() > FixupAddress)
+ FixingFromSymbol = false;
+ else
+ FixingFromSymbol = FromSymbol->getAddress() >= ToSymbol->getAddress();
+ } else
+ FixingFromSymbol = true;
+ } else {
+ if (&BlockToFix == &ToSymbol->getAddressable())
+ FixingFromSymbol = false;
+ else {
+ // BlockToFix was neither FromSymbol nor ToSymbol.
+ return make_error<JITLinkError>("SUBTRACTOR relocation must fix up "
+ "either 'A' or 'B' (or a symbol in one "
+ "of their alt-entry groups)");
+ }
+ }
+
+ if (FixingFromSymbol) {
TargetSymbol = ToSymbol;
DeltaKind = (SubRI.r_length == 3) ? x86_64::Delta64 : x86_64::Delta32;
Addend = FixupValue + (FixupAddress - FromSymbol->getAddress());
// FIXME: handle extern 'from'.
- } else if (&BlockToFix == &ToSymbol->getAddressable()) {
+ } else {
TargetSymbol = FromSymbol;
DeltaKind =
(SubRI.r_length == 3) ? x86_64::NegDelta64 : x86_64::NegDelta32;
Addend = FixupValue - (FixupAddress - ToSymbol->getAddress());
- } else {
- // BlockToFix was neither FromSymbol nor ToSymbol.
- return make_error<JITLinkError>("SUBTRACTOR relocation must fix up "
- "either 'A' or 'B' (or a symbol in one "
- "of their alt-entry chains)");
}
return PairRelocInfo(DeltaKind, TargetSymbol, Addend);
diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp
index f65ec27ff875..5a058bd712a3 100644
--- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp
@@ -105,8 +105,7 @@ llvm::orc::createDWARFContext(LinkGraph &G) {
auto SecData = getSectionData(Sec);
auto Name = Sec.getName();
// DWARFContext expects the section name to not start with a dot
- if (Name.starts_with("."))
- Name = Name.drop_front();
+ Name.consume_front(".");
LLVM_DEBUG(dbgs() << "Creating DWARFContext section " << Name
<< " with size " << SecData.size() << "\n");
DWARFSectionData[Name] =
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index a19e17029810..e259c393d07e 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -768,11 +768,11 @@ Error LLJITBuilderState::prepareForConstruction() {
// create a default one.
if (!SetupProcessSymbolsJITDylib && LinkProcessSymbolsByDefault) {
LLVM_DEBUG(dbgs() << "Creating default Process JD setup function\n");
- SetupProcessSymbolsJITDylib = [this](LLJIT &J) -> Expected<JITDylibSP> {
+ SetupProcessSymbolsJITDylib = [](LLJIT &J) -> Expected<JITDylibSP> {
auto &JD =
J.getExecutionSession().createBareJITDylib("<Process Symbols>");
- auto G = orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
- DL->getGlobalPrefix());
+ auto G = EPCDynamicLibrarySearchGenerator::GetForTargetProcess(
+ J.getExecutionSession());
if (!G)
return G.takeError();
JD.addGenerator(std::move(*G));
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ce428f78dc84..f6cf358119fb 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6026,6 +6026,17 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
bool IsFailOnly) {
+ AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
+ return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
+ IsPostfixUpdate, IsFailOnly, Failure);
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
+ const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
+ AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
+ omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
+ bool IsFailOnly, AtomicOrdering Failure) {
+
if (!updateToLocation(Loc))
return Loc.IP;
@@ -6040,7 +6051,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
bool IsInteger = E->getType()->isIntegerTy();
if (Op == OMPAtomicCompareOp::EQ) {
- AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
AtomicCmpXchgInst *Result = nullptr;
if (!IsInteger) {
IntegerType *IntCastTy =
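The OpenMP IR builder change adds a `createAtomicCompare` overload that takes an explicit failure ordering and makes the old signature forward to it with the strongest failure ordering derived from the success ordering. A condensed sketch of that delegation pattern; the enum and the deriving rule below are simplified stand-ins for the LLVM atomic-ordering machinery:

#include <cstdio>

enum class Ordering { Monotonic, Acquire, Release, AcqRel, SeqCst };

// Simplified stand-in for AtomicCmpXchgInst::getStrongestFailureOrdering:
// failure orderings must not carry release semantics.
Ordering strongestFailureOrdering(Ordering Success) {
  switch (Success) {
  case Ordering::Release:
  case Ordering::Monotonic:
    return Ordering::Monotonic;
  case Ordering::AcqRel:
  case Ordering::Acquire:
    return Ordering::Acquire;
  case Ordering::SeqCst:
    return Ordering::SeqCst;
  }
  return Ordering::Monotonic;
}

// New entry point: the caller controls the failure ordering explicitly.
void emitAtomicCompare(Ordering Success, Ordering Failure) {
  std::printf("cmpxchg success=%d failure=%d\n", static_cast<int>(Success),
              static_cast<int>(Failure));
}

// Old entry point: keeps its signature and forwards a derived default,
// mirroring how the new createAtomicCompare overload is layered.
void emitAtomicCompare(Ordering Success) {
  emitAtomicCompare(Success, strongestFailureOrdering(Success));
}

int main() {
  emitAtomicCompare(Ordering::AcqRel);                    // failure = Acquire
  emitAtomicCompare(Ordering::SeqCst, Ordering::Acquire); // explicit failure
  return 0;
}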
diff --git a/llvm/lib/FuzzMutate/FuzzerCLI.cpp b/llvm/lib/FuzzMutate/FuzzerCLI.cpp
index c64e9c04e199..58e4b74f4b22 100644
--- a/llvm/lib/FuzzMutate/FuzzerCLI.cpp
+++ b/llvm/lib/FuzzMutate/FuzzerCLI.cpp
@@ -86,13 +86,12 @@ void llvm::handleExecNameEncodedOptimizerOpts(StringRef ExecName) {
Args.push_back("-passes=gvn");
} else if (Opt == "sccp") {
Args.push_back("-passes=sccp");
-
} else if (Opt == "loop_predication") {
Args.push_back("-passes=loop-predication");
} else if (Opt == "guard_widening") {
Args.push_back("-passes=guard-widening");
} else if (Opt == "loop_rotate") {
- Args.push_back("-passes=loop(rotate)");
+ Args.push_back("-passes=loop-rotate");
} else if (Opt == "loop_unswitch") {
Args.push_back("-passes=loop(simple-loop-unswitch)");
} else if (Opt == "loop_unroll") {
@@ -107,7 +106,18 @@ void llvm::handleExecNameEncodedOptimizerOpts(StringRef ExecName) {
Args.push_back("-passes=loop-reduce");
} else if (Opt == "irce") {
Args.push_back("-passes=irce");
-
+ } else if (Opt == "dse") {
+ Args.push_back("-passes=dse");
+ } else if (Opt == "loop_idiom") {
+ Args.push_back("-passes=loop-idiom");
+ } else if (Opt == "reassociate") {
+ Args.push_back("-passes=reassociate");
+ } else if (Opt == "lower_matrix_intrinsics") {
+ Args.push_back("-passes=lower-matrix-intrinsics");
+ } else if (Opt == "memcpyopt") {
+ Args.push_back("-passes=memcpyopt");
+ } else if (Opt == "sroa") {
+ Args.push_back("-passes=sroa");
} else if (Triple(Opt).getArch()) {
Args.push_back("-mtriple=" + Opt.str());
} else {
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index fd48d5080ff6..e43f111113b4 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -1526,8 +1526,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
StringRef Name = SectionName;
// For user-defined custom sections, strip the prefix
- if (Name.starts_with(".custom_section."))
- Name = Name.substr(strlen(".custom_section."));
+ Name.consume_front(".custom_section.");
MCSymbol *Begin = Sec.getBeginSymbol();
if (Begin) {
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index 168fb57935d6..40665d686cf9 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -265,7 +265,6 @@ static wasm::WasmTableType readTableType(WasmObjectFile::ReadContext &Ctx) {
static Error readSection(WasmSection &Section, WasmObjectFile::ReadContext &Ctx,
WasmSectionOrderChecker &Checker) {
- Section.Offset = Ctx.Ptr - Ctx.Start;
Section.Type = readUint8(Ctx);
LLVM_DEBUG(dbgs() << "readSection type=" << Section.Type << "\n");
// When reading the section's size, store the size of the LEB used to encode
@@ -273,6 +272,7 @@ static Error readSection(WasmSection &Section, WasmObjectFile::ReadContext &Ctx,
const uint8_t *PreSizePtr = Ctx.Ptr;
uint32_t Size = readVaruint32(Ctx);
Section.HeaderSecSizeEncodingLen = Ctx.Ptr - PreSizePtr;
+ Section.Offset = Ctx.Ptr - Ctx.Start;
if (Size == 0)
return make_error<StringError>("zero length section",
object_error::parse_failed);
@@ -599,6 +599,10 @@ Error WasmObjectFile::parseLinkingSection(ReadContext &Ctx) {
Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
uint32_t Count = readVaruint32(Ctx);
+ // Clear out any symbol information that was derived from the exports
+ // section.
+ LinkingData.SymbolTable.clear();
+ Symbols.clear();
LinkingData.SymbolTable.reserve(Count);
Symbols.reserve(Count);
StringSet<> SymbolNames;
@@ -1290,37 +1294,75 @@ Error WasmObjectFile::parseGlobalSection(ReadContext &Ctx) {
Error WasmObjectFile::parseExportSection(ReadContext &Ctx) {
uint32_t Count = readVaruint32(Ctx);
Exports.reserve(Count);
+ LinkingData.SymbolTable.reserve(Count);
+ Symbols.reserve(Count);
for (uint32_t I = 0; I < Count; I++) {
wasm::WasmExport Ex;
Ex.Name = readString(Ctx);
Ex.Kind = readUint8(Ctx);
Ex.Index = readVaruint32(Ctx);
+ const wasm::WasmSignature *Signature = nullptr;
+ const wasm::WasmGlobalType *GlobalType = nullptr;
+ const wasm::WasmTableType *TableType = nullptr;
+ wasm::WasmSymbolInfo Info;
+ Info.Name = Ex.Name;
+ Info.Flags = 0;
switch (Ex.Kind) {
- case wasm::WASM_EXTERNAL_FUNCTION:
-
+ case wasm::WASM_EXTERNAL_FUNCTION: {
if (!isDefinedFunctionIndex(Ex.Index))
return make_error<GenericBinaryError>("invalid function export",
object_error::parse_failed);
getDefinedFunction(Ex.Index).ExportName = Ex.Name;
+ Info.Kind = wasm::WASM_SYMBOL_TYPE_FUNCTION;
+ Info.ElementIndex = Ex.Index;
+ unsigned FuncIndex = Info.ElementIndex - NumImportedFunctions;
+ wasm::WasmFunction &Function = Functions[FuncIndex];
+ Signature = &Signatures[Function.SigIndex];
break;
- case wasm::WASM_EXTERNAL_GLOBAL:
+ }
+ case wasm::WASM_EXTERNAL_GLOBAL: {
if (!isValidGlobalIndex(Ex.Index))
return make_error<GenericBinaryError>("invalid global export",
object_error::parse_failed);
+ Info.Kind = wasm::WASM_SYMBOL_TYPE_DATA;
+ uint64_t Offset = 0;
+ if (isDefinedGlobalIndex(Ex.Index)) {
+ auto Global = getDefinedGlobal(Ex.Index);
+ if (!Global.InitExpr.Extended) {
+ auto Inst = Global.InitExpr.Inst;
+ if (Inst.Opcode == wasm::WASM_OPCODE_I32_CONST) {
+ Offset = Inst.Value.Int32;
+ } else if (Inst.Opcode == wasm::WASM_OPCODE_I64_CONST) {
+ Offset = Inst.Value.Int64;
+ }
+ }
+ }
+ Info.DataRef = wasm::WasmDataReference{0, Offset, 0};
break;
+ }
case wasm::WASM_EXTERNAL_TAG:
if (!isValidTagIndex(Ex.Index))
return make_error<GenericBinaryError>("invalid tag export",
object_error::parse_failed);
+ Info.Kind = wasm::WASM_SYMBOL_TYPE_TAG;
+ Info.ElementIndex = Ex.Index;
break;
case wasm::WASM_EXTERNAL_MEMORY:
+ break;
case wasm::WASM_EXTERNAL_TABLE:
+ Info.Kind = wasm::WASM_SYMBOL_TYPE_TABLE;
break;
default:
return make_error<GenericBinaryError>("unexpected export kind",
object_error::parse_failed);
}
Exports.push_back(Ex);
+ if (Ex.Kind != wasm::WASM_EXTERNAL_MEMORY) {
+ LinkingData.SymbolTable.emplace_back(Info);
+ Symbols.emplace_back(LinkingData.SymbolTable.back(), GlobalType,
+ TableType, Signature);
+ LLVM_DEBUG(dbgs() << "Adding symbol: " << Symbols.back() << "\n");
+ }
}
if (Ctx.Ptr != Ctx.End)
return make_error<GenericBinaryError>("export section ended prematurely",
@@ -1442,6 +1484,11 @@ Error WasmObjectFile::parseCodeSection(ReadContext &Ctx) {
}
uint32_t BodySize = FunctionEnd - Ctx.Ptr;
+ // Ensure that Function is within Ctx's buffer.
+ if (Ctx.Ptr + BodySize > Ctx.End) {
+ return make_error<GenericBinaryError>("Function extends beyond buffer",
+ object_error::parse_failed);
+ }
Function.Body = ArrayRef<uint8_t>(Ctx.Ptr, BodySize);
// This will be set later when reading in the linking metadata section.
Function.Comdat = UINT32_MAX;
@@ -1644,6 +1691,8 @@ uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const {
return Segment.Offset.Inst.Value.Int32 + Sym.Info.DataRef.Offset;
} else if (Segment.Offset.Inst.Opcode == wasm::WASM_OPCODE_I64_CONST) {
return Segment.Offset.Inst.Value.Int64 + Sym.Info.DataRef.Offset;
+ } else if (Segment.Offset.Inst.Opcode == wasm::WASM_OPCODE_GLOBAL_GET) {
+ return Sym.Info.DataRef.Offset;
} else {
llvm_unreachable("unknown init expr opcode");
}
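The WasmObjectFile changes synthesize symbols from the export section, so objects without a linking section still expose a symbol table, and the explicit symtab parser clears those provisional entries before filling in the authoritative ones. A compact sketch of that fall-back-then-override flow; the Export/Symbol structs are illustrative only:

#include <cstdio>
#include <string>
#include <vector>

struct Export { std::string Name; unsigned Index; };
struct Symbol { std::string Name; unsigned ElementIndex; };

struct MiniWasmReader {
  std::vector<Symbol> Symbols;

  // Provisional symbols derived while parsing the export section.
  void parseExports(const std::vector<Export> &Exports) {
    Symbols.reserve(Exports.size());
    for (const Export &E : Exports)
      Symbols.push_back({E.Name, E.Index});
  }

  // An explicit linking symtab supersedes anything derived from exports.
  void parseLinkingSymtab(const std::vector<Symbol> &Table) {
    Symbols.clear(); // drop export-derived entries
    Symbols = Table; // keep only the authoritative ones
  }
};

int main() {
  MiniWasmReader R;
  R.parseExports({{"foo", 0}, {"bar", 1}});
  std::printf("derived: %zu symbols\n", R.Symbols.size());
  R.parseLinkingSymtab({{"foo", 0}});
  std::printf("after symtab: %zu symbols\n", R.Symbols.size());
  return 0;
}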
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 8f62df79d5b7..b547cf7181b1 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -539,7 +539,7 @@ Error RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) {
const IntPtrT FPtr = swap(I->FunctionPointer);
if (!FPtr)
continue;
- Symtab.mapAddress(FPtr, I->NameRef);
+ Symtab.mapAddress(FPtr, swap(I->NameRef));
}
return success();
}
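The profile-reader fix routes the function NameRef through the same byte-swap helper as the function pointer, so cross-endian raw profiles map addresses to the correct name hashes. A standalone reminder of what that swap does, using a local helper rather than the reader's templated swap():

#include <cstdint>
#include <cstdio>

// Local helper for illustration; the real reader uses its own swap() member.
uint64_t byteSwap64(uint64_t V) {
  uint64_t R = 0;
  for (int i = 0; i < 8; ++i)
    R = (R << 8) | ((V >> (8 * i)) & 0xFF);
  return R;
}

int main() {
  uint64_t NameRefOnDisk = 0x0123456789ABCDEFULL; // stored in the other endianness
  uint64_t NameRef = byteSwap64(NameRefOnDisk);   // value the host must index with
  std::printf("%016llx -> %016llx\n", (unsigned long long)NameRefOnDisk,
              (unsigned long long)NameRef);
  return 0;
}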
diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp
index 7256e9a29329..a9b7e209915a 100644
--- a/llvm/lib/Support/RISCVISAInfo.cpp
+++ b/llvm/lib/Support/RISCVISAInfo.cpp
@@ -75,7 +75,6 @@ static const RISCVSupportedExtension SupportedExtensions[] = {
{"xcvmac", RISCVExtensionVersion{1, 0}},
{"xcvmem", RISCVExtensionVersion{1, 0}},
{"xcvsimd", RISCVExtensionVersion{1, 0}},
- {"xsfcie", RISCVExtensionVersion{1, 0}},
{"xsfvcp", RISCVExtensionVersion{1, 0}},
{"xsfvfnrclipxfqf", RISCVExtensionVersion{1, 0}},
{"xsfvfwmaccqqq", RISCVExtensionVersion{1, 0}},
@@ -191,11 +190,17 @@ static const RISCVSupportedExtension SupportedExtensions[] = {
static const RISCVSupportedExtension SupportedExperimentalExtensions[] = {
{"zacas", RISCVExtensionVersion{1, 0}},
+ {"zcmop", RISCVExtensionVersion{0, 2}},
+
{"zfbfmin", RISCVExtensionVersion{0, 8}},
{"zicfilp", RISCVExtensionVersion{0, 4}},
+ {"zicfiss", RISCVExtensionVersion{0, 4}},
+
{"zicond", RISCVExtensionVersion{1, 0}},
+ {"zimop", RISCVExtensionVersion{0, 1}},
+
{"ztso", RISCVExtensionVersion{0, 1}},
{"zvfbfmin", RISCVExtensionVersion{0, 8}},
@@ -1006,6 +1011,7 @@ static const char *ImpliedExtsZcb[] = {"zca"};
static const char *ImpliedExtsZcd[] = {"d", "zca"};
static const char *ImpliedExtsZce[] = {"zcb", "zcmp", "zcmt"};
static const char *ImpliedExtsZcf[] = {"f", "zca"};
+static const char *ImpliedExtsZcmop[] = {"zca"};
static const char *ImpliedExtsZcmp[] = {"zca"};
static const char *ImpliedExtsZcmt[] = {"zca", "zicsr"};
static const char *ImpliedExtsZdinx[] = {"zfinx"};
@@ -1017,6 +1023,7 @@ static const char *ImpliedExtsZfinx[] = {"zicsr"};
static const char *ImpliedExtsZhinx[] = {"zhinxmin"};
static const char *ImpliedExtsZhinxmin[] = {"zfinx"};
static const char *ImpliedExtsZicntr[] = {"zicsr"};
+static const char *ImpliedExtsZicfiss[] = {"zicsr", "zimop"};
static const char *ImpliedExtsZihpm[] = {"zicsr"};
static const char *ImpliedExtsZk[] = {"zkn", "zkt", "zkr"};
static const char *ImpliedExtsZkn[] = {"zbkb", "zbkc", "zbkx",
@@ -1078,6 +1085,7 @@ static constexpr ImpliedExtsEntry ImpliedExts[] = {
{{"zcd"}, {ImpliedExtsZcd}},
{{"zce"}, {ImpliedExtsZce}},
{{"zcf"}, {ImpliedExtsZcf}},
+ {{"zcmop"}, {ImpliedExtsZcmop}},
{{"zcmp"}, {ImpliedExtsZcmp}},
{{"zcmt"}, {ImpliedExtsZcmt}},
{{"zdinx"}, {ImpliedExtsZdinx}},
@@ -1088,6 +1096,7 @@ static constexpr ImpliedExtsEntry ImpliedExts[] = {
{{"zfinx"}, {ImpliedExtsZfinx}},
{{"zhinx"}, {ImpliedExtsZhinx}},
{{"zhinxmin"}, {ImpliedExtsZhinxmin}},
+ {{"zicfiss"}, {ImpliedExtsZicfiss}},
{{"zicntr"}, {ImpliedExtsZicntr}},
{{"zihpm"}, {ImpliedExtsZihpm}},
{{"zk"}, {ImpliedExtsZk}},
diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp
index 669a9e2a8396..9f9030e79d10 100644
--- a/llvm/lib/Support/Signals.cpp
+++ b/llvm/lib/Support/Signals.cpp
@@ -145,7 +145,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
return false;
// Don't recursively invoke the llvm-symbolizer binary.
- if (Argv0.find("llvm-symbolizer") != std::string::npos)
+ if (Argv0.contains("llvm-symbolizer"))
return false;
// FIXME: Subtract necessary number from StackTrace entries to turn return addresses
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index 168a63bb2d96..2bf68b7972e7 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -154,7 +154,10 @@ std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
return "";
llvm::sys::path::make_preferred(PathNameUTF8);
- return std::string(PathNameUTF8.data());
+
+ SmallString<256> RealPath;
+ sys::fs::real_path(PathNameUTF8, RealPath);
+ return std::string(RealPath);
}
UniqueID file_status::getUniqueID() const {
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index db92a94e40e4..68f452039c9b 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -622,8 +622,16 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO
def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
"true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">;
+// AArch64 2023 Architecture Extensions (v9.5-A)
+
def FeatureCPA : SubtargetFeature<"cpa", "HasCPA", "true",
- "Enable ARMv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">;
+ "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">;
+
+def FeaturePAuthLR : SubtargetFeature<"pauth-lr", "HasPAuthLR",
+ "true", "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">;
+
+def FeatureTLBIW : SubtargetFeature<"tlbiw", "HasTLBIW", "true",
+ "Enable ARMv9.5-A TLBI VMALL for Dirty State (FEAT_TLBIW)">;
//===----------------------------------------------------------------------===//
// Architectures.
@@ -810,7 +818,7 @@ def SMEUnsupported : AArch64Unsupported {
SME2Unsupported.F);
}
-let F = [HasPAuth] in
+let F = [HasPAuth, HasPAuthLR] in
def PAUnsupported : AArch64Unsupported;
include "AArch64SchedA53.td"
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index d6c00be80bd9..99f256b88782 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -134,6 +134,14 @@ def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
form_duplane,
shuf_to_ins]>;
+// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
+def vector_unmerge_lowering : GICombineRule <
+ (defs root:$root),
+ (match (wip_match_opcode G_UNMERGE_VALUES):$root,
+ [{ return matchScalarizeVectorUnmerge(*${root}, MRI); }]),
+ (apply [{ applyScalarizeVectorUnmerge(*${root}, MRI, B); }])
+>;
+
def adjust_icmp_imm_matchdata :
GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">;
def adjust_icmp_imm : GICombineRule <
@@ -251,7 +259,8 @@ def AArch64PostLegalizerLowering
icmp_lowering, build_vector_lowering,
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
- unmerge_ext_to_unmerge, lower_mull]> {
+ unmerge_ext_to_unmerge, lower_mull,
+ vector_unmerge_lowering]> {
}
// Post-legalization combines which are primarily optimizations.
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 757471d6a905..bb7f4d907ffd 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -747,6 +747,15 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned Opc, unsigned N) {
+ assert((Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI ||
+ Opc == AArch64::LDR_PXI || Opc == AArch64::STR_PXI) &&
+ "Unexpected opcode");
+ unsigned RState = (Opc == AArch64::LDR_ZXI || Opc == AArch64::LDR_PXI)
+ ? RegState::Define
+ : 0;
+ unsigned sub0 = (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI)
+ ? AArch64::zsub0
+ : AArch64::psub0;
const TargetRegisterInfo *TRI =
MBB.getParent()->getSubtarget().getRegisterInfo();
MachineInstr &MI = *MBBI;
@@ -756,9 +765,8 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
assert(ImmOffset >= -256 && ImmOffset < 256 &&
"Immediate spill offset out of range");
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
- .addReg(
- TRI->getSubReg(MI.getOperand(0).getReg(), AArch64::zsub0 + Offset),
- Opc == AArch64::LDR_ZXI ? RegState::Define : 0)
+ .addReg(TRI->getSubReg(MI.getOperand(0).getReg(), sub0 + Offset),
+ RState)
.addReg(MI.getOperand(1).getReg(), getKillRegState(Kill))
.addImm(ImmOffset);
}
@@ -1492,12 +1500,16 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
case AArch64::STR_ZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
+ case AArch64::STR_PPXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
case AArch64::LDR_ZZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
case AArch64::LDR_ZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
case AArch64::LDR_ZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
+ case AArch64::LDR_PPXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);
case AArch64::BLR_RVMARKER:
return expandCALL_RVMARKER(MBB, MBBI);
case AArch64::BLR_BTI:
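The SVE spill/fill expander now also covers predicate-pair pseudos (LDR_PPXI/STR_PPXI), deriving both the def/use state and the sub-register base from the opcode. A tiny sketch of that opcode-driven selection using made-up enum and string names rather than the real AArch64 opcode and sub-register tables:

#include <cstdio>

enum Opcode { LDR_ZXI, STR_ZXI, LDR_PXI, STR_PXI };

struct SpillFillInfo {
  bool DefinesReg;        // loads define the destination register
  const char *SubRegBase; // "zsub0" for vectors, "psub0" for predicates
};

SpillFillInfo classify(Opcode Opc) {
  SpillFillInfo Info;
  Info.DefinesReg = (Opc == LDR_ZXI || Opc == LDR_PXI);
  Info.SubRegBase = (Opc == LDR_ZXI || Opc == STR_ZXI) ? "zsub0" : "psub0";
  return Info;
}

int main() {
  for (Opcode Opc : {LDR_ZXI, STR_ZXI, LDR_PXI, STR_PXI}) {
    SpillFillInfo I = classify(Opc);
    std::printf("opc=%d def=%d base=%s\n", Opc, I.DefinesReg, I.SubRegBase);
  }
  return 0;
}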
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 463ec41b94e9..476d99c2a7e0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1950,7 +1950,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs,
unsigned BaseReg, unsigned Op) {
unsigned TileNum = 0;
if (BaseReg != AArch64::ZA)
- TileNum = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ TileNum = N->getConstantOperandVal(2);
if (!SelectSMETile(BaseReg, TileNum))
return;
@@ -2145,8 +2145,7 @@ void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
- unsigned LaneNo =
- cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+ unsigned LaneNo = N->getConstantOperandVal(NumVecs + 2);
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 3), N->getOperand(0)};
@@ -2185,8 +2184,7 @@ void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
const EVT ResTys[] = {MVT::i64, // Type of the write back register
RegSeq->getValueType(0), MVT::Other};
- unsigned LaneNo =
- cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+ unsigned LaneNo = N->getConstantOperandVal(NumVecs + 1);
SDValue Ops[] = {RegSeq,
CurDAG->getTargetConstant(LaneNo, dl,
@@ -2237,8 +2235,7 @@ void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
SDValue RegSeq = createQTuple(Regs);
- unsigned LaneNo =
- cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+ unsigned LaneNo = N->getConstantOperandVal(NumVecs + 2);
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 3), N->getOperand(0)};
@@ -2269,8 +2266,7 @@ void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
const EVT ResTys[] = {MVT::i64, // Type of the write back register
MVT::Other};
- unsigned LaneNo =
- cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+ unsigned LaneNo = N->getConstantOperandVal(NumVecs + 1);
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 2), // Base Register
@@ -2576,8 +2572,8 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
case AArch64::UBFMXri:
Opc = NOpc;
Opd0 = N->getOperand(0);
- Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+ Immr = N->getConstantOperandVal(1);
+ Imms = N->getConstantOperandVal(2);
return true;
}
// Unreachable
@@ -3877,7 +3873,7 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
assert(isa<ConstantSDNode>(N->getOperand(2)) &&
"Expected a constant integer expression.");
unsigned Reg = PMapper->Encoding;
- uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ uint64_t Immed = N->getConstantOperandVal(2);
CurDAG->SelectNodeTo(
N, State, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32),
CurDAG->getTargetConstant(Immed, DL, MVT::i16), N->getOperand(0));
@@ -4173,8 +4169,7 @@ bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
SDValue IRG_SP = N->getOperand(2);
if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
- cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
- Intrinsic::aarch64_irg_sp) {
+ IRG_SP->getConstantOperandVal(1) != Intrinsic::aarch64_irg_sp) {
return false;
}
@@ -4183,7 +4178,7 @@ bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
SDValue FiOp = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
- int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ int TagOffset = N->getConstantOperandVal(3);
SDNode *Out = CurDAG->getMachineNode(
AArch64::TAGPstack, DL, MVT::i64,
@@ -4203,7 +4198,7 @@ void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
// General case for unrelated pointers in Op1 and Op2.
SDLoc DL(N);
- int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ int TagOffset = N->getConstantOperandVal(3);
SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
{N->getOperand(1), N->getOperand(2)});
SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
@@ -4219,7 +4214,7 @@ bool AArch64DAGToDAGISel::trySelectCastFixedLengthToScalableVector(SDNode *N) {
assert(N->getOpcode() == ISD::INSERT_SUBVECTOR && "Invalid Node!");
// Bail when not a "cast" like insert_subvector.
- if (cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() != 0)
+ if (N->getConstantOperandVal(2) != 0)
return false;
if (!N->getOperand(0).isUndef())
return false;
@@ -4250,7 +4245,7 @@ bool AArch64DAGToDAGISel::trySelectCastScalableToFixedLengthVector(SDNode *N) {
assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR && "Invalid Node!");
// Bail when not a "cast" like extract_subvector.
- if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 0)
+ if (N->getConstantOperandVal(1) != 0)
return false;
// Bail when normal isel can do the job.
@@ -4422,7 +4417,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
}
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
default:
break;
@@ -5179,7 +5174,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(0);
switch (IntNo) {
default:
break;
@@ -5782,7 +5777,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
}
case ISD::INTRINSIC_VOID: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(1);
if (Node->getNumOperands() >= 3)
VT = Node->getOperand(2)->getValueType(0);
switch (IntNo) {
@@ -6806,7 +6801,7 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
if (Opcode != ISD::INTRINSIC_VOID && Opcode != ISD::INTRINSIC_W_CHAIN)
return EVT();
- switch (cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue()) {
+ switch (Root->getConstantOperandVal(1)) {
default:
return EVT();
case Intrinsic::aarch64_sme_ldr:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dffe69bdb900..102fd0c3dae2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2196,7 +2196,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
}
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_VOID: {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
switch (IntNo) {
default:
break;
@@ -3922,9 +3922,9 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
- unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ unsigned IsWrite = Op.getConstantOperandVal(2);
+ unsigned Locality = Op.getConstantOperandVal(3);
+ unsigned IsData = Op.getConstantOperandVal(4);
bool IsStream = !Locality;
// When the locality number is set
@@ -4973,10 +4973,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(2);
- unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- unsigned Locality = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
- unsigned IsStream = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
- unsigned IsData = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned IsWrite = Op.getConstantOperandVal(3);
+ unsigned Locality = Op.getConstantOperandVal(4);
+ unsigned IsStream = Op.getConstantOperandVal(5);
+ unsigned IsData = Op.getConstantOperandVal(6);
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
(!IsData << 3) | // IsDataCache bit
(Locality << 1) | // Cache level bits
@@ -5039,7 +5039,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
@@ -5218,8 +5218,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::aarch64_sve_ptrue:
- return getPTrue(DAG, dl, Op.getValueType(),
- cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
case Intrinsic::aarch64_sve_clz:
return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -6478,7 +6477,7 @@ static unsigned getIntrinsicID(const SDNode *N) {
default:
return Intrinsic::not_intrinsic;
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(0);
if (IID < Intrinsic::num_intrinsics)
return IID;
return Intrinsic::not_intrinsic;
@@ -10009,7 +10008,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
while (Depth--)
@@ -10076,7 +10075,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
SDValue ReturnAddress;
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -10942,7 +10941,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
- unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ unsigned EltNo = V.getConstantOperandVal(1);
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
@@ -13329,7 +13328,7 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
"Only cases that extract a fixed length vector are supported!");
EVT InVT = Op.getOperand(0).getValueType();
- unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned Idx = Op.getConstantOperandVal(1);
unsigned Size = Op.getValueSizeInBits();
// If we don't have legal types yet, do nothing
@@ -13375,7 +13374,7 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
"Only expect to lower inserts into scalable vectors!");
EVT InVT = Op.getOperand(1).getValueType();
- unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Idx = Op.getConstantOperandVal(2);
SDValue Vec0 = Op.getOperand(0);
SDValue Vec1 = Op.getOperand(1);
@@ -13715,11 +13714,10 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
SplatBitSize, HasAnyUndefs);
- bool IsSplatUniform =
- SrcVT.getVectorElementType().getSizeInBits() >= SplatBitSize;
- bool IsZero = IsCnst && SplatValue == 0 && IsSplatUniform;
- bool IsOne = IsCnst && SplatValue == 1 && IsSplatUniform;
- bool IsMinusOne = IsCnst && SplatValue.isAllOnes() && IsSplatUniform;
+ bool IsZero = IsCnst && SplatValue == 0;
+ bool IsOne =
+ IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
+ bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
if (SrcVT.getVectorElementType().isFloatingPoint()) {
switch (CC) {
@@ -14247,7 +14245,7 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
assert(VT != MVT::i64 && "Expected illegal VSCALE node");
SDLoc DL(Op);
- APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
+ APInt MulImm = Op.getConstantOperandAPInt(0);
return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
VT);
}
@@ -18343,7 +18341,7 @@ static bool isEssentiallyExtractHighSubvector(SDValue N) {
return false;
if (N.getOperand(0).getValueType().isScalableVector())
return false;
- return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
+ return N.getConstantOperandAPInt(1) ==
N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
@@ -18399,8 +18397,8 @@ static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
// TODO: we want the operands of the Cmp not the csel
SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
SetCCInfo.IsAArch64 = true;
- SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
- cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+ SetCCInfo.Info.AArch64.CC =
+ static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
// Check that the operands matches the constraints:
// (1) Both operands must be constants.
@@ -21585,7 +21583,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
bool IsDupOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default: llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
@@ -22501,7 +22499,7 @@ static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
static SDValue performTBZCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Bit = N->getConstantOperandVal(2);
bool Invert = false;
SDValue TestSrc = N->getOperand(1);
SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
@@ -23789,7 +23787,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMULLCombine(N, DCI, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
- switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ switch (N->getConstantOperandVal(1)) {
case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
@@ -23940,8 +23938,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
case Intrinsic::aarch64_rndr:
case Intrinsic::aarch64_rndrrs: {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntrinsicID = N->getConstantOperandVal(1);
auto Register =
(IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
: AArch64SysReg::RNDRRS);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 690ac0dcda62..cb63d8726744 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -2368,6 +2368,80 @@ class ClearAuth<bits<1> data, string asm>
let Inst{4-0} = Rd;
}
+// v9.5-A FEAT_PAuth_LR
+
+class SignAuthFixedRegs<bits<5> opcode2, bits<6> opcode, string asm>
+ : I<(outs), (ins), asm, "", "", []>,
+ Sched<[WriteI, ReadI]> {
+ let Inst{31} = 0b1; // sf
+ let Inst{30} = 0b1;
+ let Inst{29} = 0b0; // S
+ let Inst{28-21} = 0b11010110;
+ let Inst{20-16} = opcode2;
+ let Inst{15-10} = opcode;
+ let Inst{9-5} = 0b11111; // Rn
+ let Inst{4-0} = 0b11110; // Rd
+}
+
+def PAuthPCRelLabel16Operand : PCRelLabel<16> {
+ let Name = "PAuthPCRelLabel16";
+ let PredicateMethod = "isPAuthPCRelLabel16Operand";
+}
+def am_pauth_pcrel : Operand<OtherVT> {
+ let EncoderMethod = "getPAuthPCRelOpValue";
+ let DecoderMethod = "DecodePCRelLabel16";
+ let PrintMethod = "printAlignedLabel";
+ let ParserMatchClass = PAuthPCRelLabel16Operand;
+ let OperandType = "OPERAND_PCREL";
+}
+
+class SignAuthPCRel<bits<2> opc, string asm>
+ : I<(outs), (ins am_pauth_pcrel:$label), asm, "\t$label", "", []>,
+ Sched<[]> {
+ bits<16> label;
+ let Inst{31} = 0b1; // sf
+ let Inst{30-23} = 0b11100111;
+ let Inst{22-21} = opc;
+ let Inst{20-5} = label; // imm
+ let Inst{4-0} = 0b11111; // Rd
+}
+
+class SignAuthOneReg<bits<5> opcode2, bits<6> opcode, string asm>
+ : I<(outs), (ins GPR64:$Rn), asm, "\t$Rn", "", []>,
+ Sched<[]> {
+ bits<5> Rn;
+ let Inst{31} = 0b1; // sf
+ let Inst{30} = 0b1;
+ let Inst{29} = 0b0; // S
+ let Inst{28-21} = 0b11010110;
+ let Inst{20-16} = opcode2;
+ let Inst{15-10} = opcode;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = 0b11110; // Rd
+}
+
+class SignAuthReturnPCRel<bits<3> opc, bits<5> op2, string asm>
+ : I<(outs), (ins am_pauth_pcrel:$label), asm, "\t$label", "", []>,
+ Sched<[WriteAtomic]> {
+ bits<16> label;
+ let Inst{31-24} = 0b01010101;
+ let Inst{23-21} = opc;
+ let Inst{20-5} = label; // imm16
+ let Inst{4-0} = op2;
+}
+
+class SignAuthReturnReg<bits<6> op3, string asm>
+ : I<(outs), (ins GPR64common:$Rm), asm, "\t$Rm", "", []>,
+ Sched<[WriteAtomic]> {
+ bits<5> Rm;
+ let Inst{31-25} = 0b1101011;
+ let Inst{24-21} = 0b0010; // opc
+ let Inst{20-16} = 0b11111; // op2
+ let Inst{15-10} = op3;
+ let Inst{9-5} = 0b11111; // Rn
+ let Inst{4-0} = Rm; // op4 (Rm)
+}
+
// Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions
class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops>
: I<(outs), iops, asm, ops, "", []>,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7d71c316bcb0..1cfbf4737a6f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3771,6 +3771,13 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::LDR_PPXI:
+ case AArch64::STR_PPXI:
+ Scale = TypeSize::getScalable(2);
+ Width = TypeSize::getScalable(2 * 2);
+ MinOffset = -256;
+ MaxOffset = 254;
+ break;
case AArch64::LDR_ZXI:
case AArch64::STR_ZXI:
Scale = TypeSize::getScalable(16);
@@ -4089,6 +4096,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
return MI.getOperand(Idx);
}
+const MachineOperand &
+AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case AArch64::LDRBBroX:
+ return MI.getOperand(4);
+ }
+}
+
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
Register Reg) {
if (MI.getParent() == nullptr)
@@ -4804,6 +4821,10 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(SrcReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::STRSui;
+ else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::STR_PPXI;
+ StackID = TargetStackID::ScalableVector;
+ }
break;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
@@ -4980,6 +5001,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
assert(DestReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRSui;
+ else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
+ Opc = AArch64::LDR_PPXI;
+ StackID = TargetStackID::ScalableVector;
+ }
break;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
@@ -8779,12 +8804,23 @@ AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
// Don't outline anything used for return address signing. The outlined
// function will get signed later if needed
switch (MI.getOpcode()) {
+ case AArch64::PACM:
case AArch64::PACIASP:
case AArch64::PACIBSP:
+ case AArch64::PACIASPPC:
+ case AArch64::PACIBSPPC:
case AArch64::AUTIASP:
case AArch64::AUTIBSP:
+ case AArch64::AUTIASPPCi:
+ case AArch64::AUTIASPPCr:
+ case AArch64::AUTIBSPPCi:
+ case AArch64::AUTIBSPPCr:
case AArch64::RETAA:
case AArch64::RETAB:
+ case AArch64::RETAASPPCi:
+ case AArch64::RETAASPPCr:
+ case AArch64::RETABSPPCi:
+ case AArch64::RETABSPPCr:
case AArch64::EMITBKEY:
case AArch64::PAUTH_PROLOGUE:
case AArch64::PAUTH_EPILOGUE:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 6526f6740747..db24a19fe5f8 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -111,6 +111,9 @@ public:
/// Returns the immediate offset operator of a load/store.
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
+ /// Returns the shift amount operator of a load/store.
+ static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
+
/// Returns whether the instruction is FP or NEON.
static bool isFpOrNEON(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index bdb38f0c3789..62b2bf490f37 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -61,6 +61,9 @@ def HasLOR : Predicate<"Subtarget->hasLOR()">,
def HasPAuth : Predicate<"Subtarget->hasPAuth()">,
AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">;
+def HasPAuthLR : Predicate<"Subtarget->hasPAuthLR()">,
+ AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">;
+
def HasJS : Predicate<"Subtarget->hasJS()">,
AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">;
@@ -1646,6 +1649,47 @@ let Predicates = [HasPAuth] in {
}
+// v9.5-A pointer authentication extensions
+
+// Always accept "pacm" as an alias for "hint #39", but don't emit it when
+// disassembling if we don't have the pauth-lr feature.
+let CRm = 0b0100 in {
+ def PACM : SystemNoOperands<0b111, "hint\t#39">;
+}
+def : InstAlias<"pacm", (PACM), 0>;
+
+let Predicates = [HasPAuthLR] in {
+ let Defs = [LR], Uses = [LR, SP] in {
+ // opcode2, opcode, asm
+ def PACIASPPC : SignAuthFixedRegs<0b00001, 0b101000, "paciasppc">;
+ def PACIBSPPC : SignAuthFixedRegs<0b00001, 0b101001, "pacibsppc">;
+ def PACNBIASPPC : SignAuthFixedRegs<0b00001, 0b100000, "pacnbiasppc">;
+ def PACNBIBSPPC : SignAuthFixedRegs<0b00001, 0b100001, "pacnbibsppc">;
+ // opc, asm
+ def AUTIASPPCi : SignAuthPCRel<0b00, "autiasppc">;
+ def AUTIBSPPCi : SignAuthPCRel<0b01, "autibsppc">;
+ // opcode2, opcode, asm
+ def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppc">;
+ def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppc">;
+ // opcode2, opcode, asm
+ def PACIA171615 : SignAuthFixedRegs<0b00001, 0b100010, "pacia171615">;
+ def PACIB171615 : SignAuthFixedRegs<0b00001, 0b100011, "pacib171615">;
+ def AUTIA171615 : SignAuthFixedRegs<0b00001, 0b101110, "autia171615">;
+ def AUTIB171615 : SignAuthFixedRegs<0b00001, 0b101111, "autib171615">;
+ }
+
+ let Uses = [LR, SP], isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+ // opc, op2, asm
+ def RETAASPPCi : SignAuthReturnPCRel<0b000, 0b11111, "retaasppc">;
+ def RETABSPPCi : SignAuthReturnPCRel<0b001, 0b11111, "retabsppc">;
+ // op3, asm
+ def RETAASPPCr : SignAuthReturnReg<0b000010, "retaasppc">;
+ def RETABSPPCr : SignAuthReturnReg<0b000011, "retabsppc">;
+ }
+ def : InstAlias<"pacm", (PACM), 1>;
+}
+
+
// v8.3a floating point conversion for javascript
let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in
def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
@@ -6482,23 +6526,23 @@ def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))),
// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
(EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>;
-def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, 0)))),
+def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, ssub)>;
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
(EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>;
-def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, 0)))),
+def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, dsub)>;
// Floating point vector extractions are codegen'd as either a sequence of
// subregister extractions, or a MOV (aka DUP here) if
// the lane number is anything other than zero.
-def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+def : Pat<(f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
-def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+def : Pat<(f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
-def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
+def : Pat<(f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
-def : Pat<(vector_extract (v8bf16 V128:$Rn), 0),
+def : Pat<(bf16 (vector_extract (v8bf16 V128:$Rn), (i64 0))),
(bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index dc6d5b8950c3..b435b3ce03e7 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+ "Number of const offset of index address folded");
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
"Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);
+// LdStConstLimit limits how far we search for constant-offset instructions
+// when forming indexed-address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+ cl::init(10), cl::Hidden);
+
// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
int UnscaledOffset, unsigned Limit);
+ // Scan backwards through the instruction list to find a constant assigned
+ // to the index register of the current instruction (a load or store) that
+ // can be folded into it to form a base + unsigned-immediate-offset access.
+ MachineBasicBlock::iterator
+ findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+ unsigned &Offset);
+
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
+ bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+ unsigned IndexReg, unsigned &Offset);
+
// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
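+ // Merge a movz/movk-materialized constant offset into a register-offset
+ // ld/st instruction, rewriting it as an add plus a base + imm12 ld/st.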
+ MachineBasicBlock::iterator
+ mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update, unsigned Offset,
+ int Scale);
+
// Find and merge zero store instructions.
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+ // Find and merge an index ldr/st instruction into a base ld/st instruction.
+ bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
+
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
}
}
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+ // TODO: Add more index address loads/stores.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no base address equivalent!");
+ case AArch64::LDRBBroX:
+ return AArch64::LDRBBui;
+ }
+}
+
static unsigned getPostIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
}
}
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ return false;
+ // Scaled instructions.
+ // TODO: Add more index address loads/stores.
+ case AArch64::LDRBBroX:
+ Scale = 1;
+ return true;
+ }
+}
+
static bool isRewritableImplicitDef(unsigned Opc) {
switch (Opc) {
default:
@@ -2018,6 +2067,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
return NextI;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update,
+ unsigned Offset, int Scale) {
+ assert((Update->getOpcode() == AArch64::MOVKWi) &&
+ "Unexpected const mov instruction to merge!");
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+ MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+ MachineInstr &MemMI = *I;
+ unsigned Mask = (1 << 12) * Scale - 1;
+ unsigned Low = Offset & Mask;
+ unsigned High = Offset - Low;
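+ // Illustrative (assumed) example: with Scale = 1 and Offset = 0x12345,
+ // Mask = 0xfff, Low = 0x345 (folded into the new ld/st imm12 below) and
+ // High = 0x12000 (materialized by the ADDXri as #0x12, LSL #12).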
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+ MachineInstrBuilder AddMIB, MemMIB;
+
+ // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+ AddMIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+ .addDef(IndexReg)
+ .addUse(BaseReg)
+ .addImm(High >> 12) // shifted value
+ .addImm(12); // shift 12
+ (void)AddMIB;
+ // Ld/St DestReg, IndexReg, Imm12
+ unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+ MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .add(getLdStRegOp(MemMI))
+ .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+ .addImm(Low / Scale)
+ .setMemRefs(I->memoperands())
+ .setMIFlags(I->mergeFlagsWith(*Update));
+ (void)MemMIB;
+
+ ++NumConstOffsetFolded;
+ LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(PrevI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Update->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ PrevI->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MachineInstr &MI,
unsigned BaseReg, int Offset) {
@@ -2065,6 +2171,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
return false;
}
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+ MachineInstr &MI,
+ unsigned IndexReg,
+ unsigned &Offset) {
+ // The MOVK instruction's tied source/destination register must be the same
+ // as the load/store index register.
+ if (MI.getOpcode() == AArch64::MOVKWi &&
+ TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+ // A movz + movk pair holds the large offset used by the Ld/St instruction.
+ MachineBasicBlock::iterator B = MI.getParent()->begin();
+ MachineBasicBlock::iterator MBBI = &MI;
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MovzMI = *MBBI;
+ if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+ unsigned Low = MovzMI.getOperand(1).getImm();
+ unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+ Offset = High + Low;
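+ // Illustrative (assumed) example: movz w8, #0x345 followed by
+ // movk w8, #0x12, lsl #16 gives Low = 0x345, High = 0x120000 and
+ // Offset = 0x120345, which passes the 24-bit check below.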
+ // The high part must fit the ADD's 12-bit immediate shifted left by 12,
+ // i.e. the whole offset must fit in 24 bits.
+ return Offset >> 24 == 0;
+ }
+ }
+ return false;
+}
+
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2220,6 +2351,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+ MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr &MemMI = *I;
+ MachineBasicBlock::iterator MBBI = I;
+
+ // If the load/store is the first instruction in the block, there's
+ // obviously nothing earlier that could define the constant offset.
+ if (MBBI == B)
+ return E;
+
+ // Make sure the IndexReg is killed and the shift amount is zero.
+ // TODO: Relax this restriction to also handle extends; keep the processing
+ // simple for now.
+ if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+ !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+ (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+ return E;
+
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+ unsigned Count = 0;
+ do {
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+ return MBBI;
+ }
+
+ // Update the status of what the instruction clobbered and used.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+ // Otherwise, if the index register is used or modified, we have no match,
+ // so return early.
+ if (!ModifiedRegUnits.available(IndexReg) ||
+ !UsedRegUnits.available(IndexReg))
+ return E;
+
+ } while (MBBI != B && Count < Limit);
+ return E;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2404,6 +2589,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
return false;
}
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
+ int Scale) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
+ MachineBasicBlock::iterator Update;
+
+ // Don't know how to handle unscaled versions of the instruction, so bail.
+ if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+ return false;
+
+ // Look back to try to find a const offset for an index LdSt instruction. For
+ // example,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // merged into:
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ unsigned Offset;
+ Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
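+ // Only fold the offset if a constant was found and it is a multiple of the
+ // access size.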
+ if (Update != E && (Offset & (Scale - 1)) == 0) {
+ // Merge the imm12 into the ld/st.
+ MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
@@ -2482,6 +2695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
+ // 5) Find a register assigned with a const value that can be combined
+ //    into the load or store. e.g.,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // ; becomes
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ int Scale;
+ if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+
return Modified;
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 9da59ef2a806..1a8c71888a85 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -93,16 +93,24 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
// TODO: skip functions that have no instrumented allocas for optimization
IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag);
- if (!F.hasFnAttribute("branch-target-enforcement")) {
- if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
- F.getParent()->getModuleFlag("branch-target-enforcement")))
- BranchTargetEnforcement = BTE->getZExtValue();
- } else {
- const StringRef BTIEnable =
- F.getFnAttribute("branch-target-enforcement").getValueAsString();
- assert(BTIEnable == "true" || BTIEnable == "false");
- BranchTargetEnforcement = BTIEnable == "true";
- }
+ // BTI/PAuthLR may be set on either the function or the module. Set Bool
+ // from the function attribute if it is present, otherwise fall back to the
+ // module attribute.
+ // Note: the module attribute is numeric (0 or 1) but the function attribute
+ // is a string ("true" or "false").
+ auto TryFnThenModule = [&](StringRef AttrName, bool &Bool) {
+ if (F.hasFnAttribute(AttrName)) {
+ const StringRef V = F.getFnAttribute(AttrName).getValueAsString();
+ assert(V.equals_insensitive("true") || V.equals_insensitive("false"));
+ Bool = V.equals_insensitive("true");
+ } else if (const auto *ModVal = mdconst::extract_or_null<ConstantInt>(
+ F.getParent()->getModuleFlag(AttrName))) {
+ Bool = ModVal->getZExtValue();
+ }
+ };
+
+ TryFnThenModule("branch-target-enforcement", BranchTargetEnforcement);
+ TryFnThenModule("branch-protection-pauth-lr", BranchProtectionPAuthLR);
// The default stack probe size is 4096 if the function has no
// stack-probe-size attribute. This is a safe default because it is the
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 219f83cfd32e..cd4a18bfbc23 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCSymbol.h"
#include <cassert>
#include <optional>
@@ -164,10 +165,21 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
/// SignWithBKey modifies the default PAC-RET mode to signing with the B key.
bool SignWithBKey = false;
+ /// SignInstrLabel marks the PAC-RET signing instruction in the prologue, so
+ /// its PC can be re-used for authentication in the epilogue when using PC as
+ /// a second salt (FEAT_PAuth_LR).
+ MCSymbol *SignInstrLabel = nullptr;
+
/// BranchTargetEnforcement enables placing BTI instructions at potential
/// indirect branch destinations.
bool BranchTargetEnforcement = false;
+ /// Indicates that SP signing should be diversified with PC as-per PAuthLR.
+ /// This is set by -mbranch-protection and will emit NOP instructions unless
+ /// the subtarget feature +pauthlr is also used (in which case non-NOP
+ /// instructions are emitted).
+ bool BranchProtectionPAuthLR = false;
+
/// Whether this function has an extended frame record [Ctx, FP, LR]. If so,
/// bit 60 of the in-memory FP will be 1 to enable other tools to detect the
/// extended record.
@@ -436,10 +448,16 @@ public:
bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) const;
bool shouldSignWithBKey() const { return SignWithBKey; }
+
+ MCSymbol *getSigningInstrLabel() const { return SignInstrLabel; }
+ void setSigningInstrLabel(MCSymbol *Label) { SignInstrLabel = Label; }
+
bool isMTETagged() const { return IsMTETagged; }
bool branchTargetEnforcement() const { return BranchTargetEnforcement; }
+ bool branchProtectionPAuthLR() const { return BranchProtectionPAuthLR; }
+
void setHasSwiftAsyncContext(bool HasContext) {
HasSwiftAsyncContext = HasContext;
}
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
index 7576d2a899d1..7509afaeb5fe 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
@@ -60,11 +60,35 @@ FunctionPass *llvm::createAArch64PointerAuthPass() {
char AArch64PointerAuth::ID = 0;
+// Where PAuthLR support is not known at compile time, it is supported using
+// PACM. PACM is in the hint space so has no effect when PAuthLR is not
+// supported by the hardware, but will alter the behaviour of PACI*SP, AUTI*SP
+// and RETAA/RETAB if the hardware supports PAuthLR.
+static void BuildPACM(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ MachineInstr::MIFlag Flags, MCSymbol *PACSym = nullptr) {
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ auto &MFnI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
+
+ // ADR X16,<address_of_PACIASP>
+ if (PACSym) {
+ assert(Flags == MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADR))
+ .addReg(AArch64::X16, RegState::Define)
+ .addSym(PACSym);
+ }
+
+ // Only emit PACM if -mbranch-protection has +pc and the target does not
+ // have feature +pauth-lr.
+ if (MFnI.branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACM)).setMIFlag(Flags);
+}
+
void AArch64PointerAuth::signLR(MachineFunction &MF,
MachineBasicBlock::iterator MBBI) const {
- const AArch64FunctionInfo *MFnI = MF.getInfo<AArch64FunctionInfo>();
- bool UseBKey = MFnI->shouldSignWithBKey();
- bool EmitCFI = MFnI->needsDwarfUnwindInfo(MF);
+ auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
+ bool UseBKey = MFnI.shouldSignWithBKey();
+ bool EmitCFI = MFnI.needsDwarfUnwindInfo(MF);
bool NeedsWinCFI = MF.hasWinCFI();
MachineBasicBlock &MBB = *MBBI->getParent();
@@ -77,11 +101,29 @@ void AArch64PointerAuth::signLR(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ // PAuthLR authentication instructions need to know the value of PC at the
+ // point of signing (PACI*).
+ if (MFnI.branchProtectionPAuthLR()) {
+ MCSymbol *PACSym = MF.getMMI().getContext().createTempSymbol();
+ MFnI.setSigningInstrLabel(PACSym);
+ }
+
// No SEH opcode for this one; it doesn't materialize into an
// instruction on Windows.
- BuildMI(MBB, MBBI, DL,
- TII->get(UseBKey ? AArch64::PACIBSP : AArch64::PACIASP))
- .setMIFlag(MachineInstr::FrameSetup);
+ if (MFnI.branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) {
+ BuildMI(MBB, MBBI, DL,
+ TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSPPC
+ : AArch64::PACIASPPC))
+ .setMIFlag(MachineInstr::FrameSetup)
+ ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel());
+ } else {
+ BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL,
+ TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSP
+ : AArch64::PACIASP))
+ .setMIFlag(MachineInstr::FrameSetup)
+ ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel());
+ }
if (EmitCFI) {
unsigned CFIIndex =
@@ -118,15 +160,37 @@ void AArch64PointerAuth::authenticateLR(
// DW_CFA_AARCH64_negate_ra_state can't be emitted.
bool TerminatorIsCombinable =
TI != MBB.end() && TI->getOpcode() == AArch64::RET;
+ MCSymbol *PACSym = MFnI->getSigningInstrLabel();
+
if (Subtarget->hasPAuth() && TerminatorIsCombinable && !NeedsWinCFI &&
!MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
- unsigned CombinedRetOpcode = UseBKey ? AArch64::RETAB : AArch64::RETAA;
- BuildMI(MBB, TI, DL, TII->get(CombinedRetOpcode)).copyImplicitOps(*TI);
+ if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) {
+ assert(PACSym && "No PAC instruction to refer to");
+ BuildMI(MBB, TI, DL,
+ TII->get(UseBKey ? AArch64::RETABSPPCi : AArch64::RETAASPPCi))
+ .addSym(PACSym)
+ .copyImplicitOps(*MBBI)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else {
+ BuildPACM(*Subtarget, MBB, TI, DL, MachineInstr::FrameDestroy, PACSym);
+ BuildMI(MBB, TI, DL, TII->get(UseBKey ? AArch64::RETAB : AArch64::RETAA))
+ .copyImplicitOps(*MBBI)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
MBB.erase(TI);
} else {
- unsigned AutOpcode = UseBKey ? AArch64::AUTIBSP : AArch64::AUTIASP;
- BuildMI(MBB, MBBI, DL, TII->get(AutOpcode))
- .setMIFlag(MachineInstr::FrameDestroy);
+ if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) {
+ assert(PACSym && "No PAC instruction to refer to");
+ BuildMI(MBB, MBBI, DL,
+ TII->get(UseBKey ? AArch64::AUTIBSPPCi : AArch64::AUTIASPPCi))
+ .addSym(PACSym)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ } else {
+ BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, PACSym);
+ BuildMI(MBB, MBBI, DL,
+ TII->get(UseBKey ? AArch64::AUTIBSP : AArch64::AUTIASP))
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
if (EmitAsyncCFI) {
unsigned CFIIndex =
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3dae6f7795ee..344a15389063 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2398,11 +2398,13 @@ let Predicates = [HasSVEorSME] in {
def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let mayStore = 1, hasSideEffects = 0 in {
def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let AddedComplexity = 1 in {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
index 813b4a3affcf..7edce4b61605 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
@@ -22,7 +22,7 @@ def A64FXModel : SchedMachineModel {
list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, SVEUnsupported.F,
[HasMTE, HasMatMulInt8, HasBF16,
- HasPAuth, HasCPA]);
+ HasPAuth, HasPAuthLR, HasCPA]);
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index 53cf725f0e23..a6fab5e6245f 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel {
let CompleteModel = 1;
list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
- [HasSVE2p1, HasCPA]);
+ [HasSVE2p1, HasPAuthLR, HasCPA]);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index e8b5f6059c9e..0b80f263e12e 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -643,6 +643,14 @@ defm : TLBI<"PAALLOS", 0b110, 0b1000, 0b0001, 0b100, 0>;
defm : TLBI<"PAALL", 0b110, 0b1000, 0b0111, 0b100, 0>;
}
+// Armv9.5-A TLBI VMALL for Dirty State
+let Requires = ["AArch64::FeatureTLBIW"] in {
+// op1, CRn, CRm, op2, needsreg
+defm : TLBI<"VMALLWS2E1", 0b100, 0b1000, 0b0110, 0b010, 0>;
+defm : TLBI<"VMALLWS2E1IS", 0b100, 0b1000, 0b0010, 0b010, 0>;
+defm : TLBI<"VMALLWS2E1OS", 0b100, 0b1000, 0b0101, 0b010, 0>;
+}
+
//===----------------------------------------------------------------------===//
// MRS/MSR (system register read/write) instruction options.
//===----------------------------------------------------------------------===//
@@ -1946,3 +1954,22 @@ def : RWSysReg<"MDSTEPOP_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b010>;
// v9.5a System PMU zero register (FEAT_SPMU2)
// Op0 Op1 CRn CRm Op2
def : WOSysReg<"SPMZR_EL0", 0b10, 0b011, 0b1001, 0b1100, 0b100>;
+
+// v9.5a Delegated SError exceptions for EL3 (FEAT_E3DSE)
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"VDISR_EL3", 0b11, 0b110, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VSESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b011>;
+
+// v9.5a Hardware Dirty State Tracking Structure (FEAT_HDBSS)
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"HDBSSBR_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b010>;
+def : RWSysReg<"HDBSSPROD_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b011>;
+
+// v9.5a Hardware Accelerator for Cleaning Dirty State (FEAT_HACDBS)
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"HACDBSBR_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b100>;
+def : RWSysReg<"HACDBSCONS_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b101>;
+
+// v9.5a Fine Grained Write Trap EL3 (FEAT_FGWTE3)
+// Op0 Op1 CRn CRm Op2
+def : RWSysReg<"FGWTE3_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b101>;
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 74afa4183e67..be66790c4277 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1696,6 +1696,21 @@ public:
return DiagnosticPredicateTy::Match;
}
+ bool isPAuthPCRelLabel16Operand() const {
+ // PAuth PCRel16 operands are similar to regular branch targets, but only
+ // negative values are allowed for concrete immediates, as the signing
+ // instruction is at a lower address.
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return true;
+ int64_t Val = MCE->getValue();
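+ // The concrete offset must be 4-byte aligned and non-positive, and its
+ // magnitude must fit in 18 bits, as checked below.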
+ if (Val & 0b11)
+ return false;
+ return (Val <= 0) && (Val > -(1 << 18));
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible. Null MCExpr = 0.
if (!Expr)
@@ -1997,6 +2012,19 @@ public:
Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
}
+ void addPAuthPCRelLabel16Operands(MCInst &Inst, unsigned N) const {
+ // PC-relative operands don't encode the low bits, so shift them off
+ // here. If it's a label, however, just put it on directly as there's
+ // not enough information now to do anything.
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE) {
+ addExpr(Inst, getImm());
+ return;
+ }
+ Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
+ }
+
void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
// Branch operands don't encode the low bits, so shift them off
// here. If it's a label, however, just put it on directly as there's
@@ -3678,6 +3706,7 @@ static const struct Extension {
{"sme-f8f32", {AArch64::FeatureSMEF8F32}},
{"sme-fa64", {AArch64::FeatureSMEFA64}},
{"cpa", {AArch64::FeatureCPA}},
+ {"tlbiw", {AArch64::FeatureTLBIW}},
};
static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index cf2d3879292d..a21b4b77166e 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -165,6 +165,9 @@ static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm,
uint64_t Address,
const MCDisassembler *Decoder);
+static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
uint64_t Address,
const MCDisassembler *Decoder);
@@ -887,6 +890,21 @@ static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm,
return Success;
}
+static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
+ // The immediate is encoded as the top 16 bits of the magnitude of a
+ // negative 18-bit PC-relative offset.
+ uint64_t ImmVal = Imm;
+ if (ImmVal > (1 << 16))
+ return Fail;
+ ImmVal = -ImmVal;
+ if (!Decoder->tryAddingSymbolicOperand(Inst, (ImmVal << 2), Addr,
+ /*IsBranch=*/false, 0, 0, 4))
+ Inst.addOperand(MCOperand::createImm(ImmVal));
+ return Success;
+}
+
static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm,
uint64_t Addr,
const MCDisassembler *Decoder) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 8b909f53c844..1d0e8be80d07 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -623,6 +623,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalFor({s32, s64})
.legalFor(PackedVectorAllTypeList)
.maxScalar(0, s64)
+ .clampNumElements(0, v8s8, v16s8)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampMaxNumElements(0, s64, 2)
.lower();
// FP conversions
@@ -1406,7 +1410,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::aarch64_neon_umax:
case Intrinsic::aarch64_neon_umin:
case Intrinsic::aarch64_neon_fmax:
- case Intrinsic::aarch64_neon_fmin: {
+ case Intrinsic::aarch64_neon_fmin:
+ case Intrinsic::aarch64_neon_fmaxnm:
+ case Intrinsic::aarch64_neon_fminnm: {
MachineIRBuilder MIB(MI);
if (IntrinsicID == Intrinsic::aarch64_neon_smax)
MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
@@ -1422,6 +1428,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
{MI.getOperand(2), MI.getOperand(3)});
+ else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
+ MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
+ {MI.getOperand(2), MI.getOperand(3)});
+ else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
+ MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
+ {MI.getOperand(2), MI.getOperand(3)});
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 687063873a16..830203b61c58 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -769,6 +769,27 @@ void applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}
+bool matchScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ auto &Unmerge = cast<GUnmerge>(MI);
+ Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1);
+ const LLT SrcTy = MRI.getType(Src1Reg);
+ return SrcTy.isVector() && !SrcTy.isScalable() &&
+ Unmerge.getNumOperands() == (unsigned)SrcTy.getNumElements() + 1;
+}
+
+void applyScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) {
+ auto &Unmerge = cast<GUnmerge>(MI);
+ Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1);
+ const LLT SrcTy = MRI.getType(Src1Reg);
+ assert((SrcTy.isVector() && !SrcTy.isScalable()) &&
+ "Expected a fixed length vector");
+
+ for (int I = 0; I < SrcTy.getNumElements(); ++I)
+ B.buildExtractVectorElementConstant(Unmerge.getReg(I), Src1Reg, I);
+ MI.eraseFromParent();
+}
+
bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) {
assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
auto Splat = getAArch64VectorSplat(MI, MRI);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index a6900b8963bb..30ef3680ae79 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -67,6 +67,7 @@ public:
{"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal},
{"fixup_aarch64_movw", 5, 16, 0},
{"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_branch16", 5, 16, PCRelFlagVal},
{"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal},
{"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal},
{"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}};
@@ -121,6 +122,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case AArch64::fixup_aarch64_movw:
case AArch64::fixup_aarch64_pcrel_branch14:
+ case AArch64::fixup_aarch64_pcrel_branch16:
case AArch64::fixup_aarch64_add_imm12:
case AArch64::fixup_aarch64_ldst_imm12_scale1:
case AArch64::fixup_aarch64_ldst_imm12_scale2:
@@ -314,6 +316,17 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
if (Value & 0x3)
Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3fff;
+ case AArch64::fixup_aarch64_pcrel_branch16:
+ // Unsigned PC-relative offset, so invert the negative immediate.
+ SignedValue = -SignedValue;
+ Value = static_cast<uint64_t>(SignedValue);
+ // Check valid 18-bit unsigned range.
+ if (SignedValue < 0 || SignedValue > ((1 << 18) - 1))
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ // Low two bits are not encoded (4-byte alignment assumed).
+ if (Value & 0b11)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ return (Value >> 2) & 0xffff;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
if (TheTriple.isOSBinFormatCOFF() && !IsResolved && SignedValue != 0) {
@@ -380,6 +393,7 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
case AArch64::fixup_aarch64_movw:
case AArch64::fixup_aarch64_pcrel_branch14:
+ case AArch64::fixup_aarch64_pcrel_branch16:
case AArch64::fixup_aarch64_add_imm12:
case AArch64::fixup_aarch64_ldst_imm12_scale1:
case AArch64::fixup_aarch64_ldst_imm12_scale2:
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 9de40661298c..496ab18e9b19 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -186,6 +186,10 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(LD_PREL_LO19);
case AArch64::fixup_aarch64_pcrel_branch14:
return R_CLS(TSTBR14);
+ case AArch64::fixup_aarch64_pcrel_branch16:
+ Ctx.reportError(Fixup.getLoc(),
+ "relocation of PAC/AUT instructions is not supported");
+ return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_pcrel_branch19:
return R_CLS(CONDBR19);
default:
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index 767dd8805520..fdee2d5ad2bf 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -43,6 +43,11 @@ enum Fixups {
// The high 14 bits of a 21-bit pc-relative immediate.
fixup_aarch64_pcrel_branch14,
+ // The high 16 bits of an 18-bit unsigned PC-relative immediate. Used by
+ // pointer authentication, and only within a function, so no relocation can
+ // be generated.
+ fixup_aarch64_pcrel_branch16,
+
// The high 19 bits of a 21-bit pc-relative immediate. Same encoding as
// fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates
// relocations directly when necessary.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index dbc4323a860f..c3e12b6d8024 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -88,6 +88,12 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ /// getPAuthPCRelOpValue - Return the encoded value for a pointer
+ /// authentication pc-relative operand.
+ uint32_t getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// getLoadLiteralOpValue - Return the encoded value for a load-literal
/// pc-relative address.
uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
@@ -327,6 +333,29 @@ uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
return 0;
}
+/// getPAuthPCRelOpValue - Return the encoded value for a pointer
+/// authentication pc-relative operand.
+uint32_t
+AArch64MCCodeEmitter::getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpIdx);
+
+ // If the operand is a concrete immediate, invert its sign, as it is a
+ // negative value that should be encoded as an unsigned magnitude.
+ if (MO.isImm())
+ return -(MO.getImm());
+ assert(MO.isExpr() && "Unexpected target type!");
+
+ MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch16);
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+ ++MCNumFixups;
+
+ // All of the information is in the fixup.
+ return 0;
+}
+
/// getLoadLiteralOpValue - Return the encoded value for a load-literal
/// pc-relative address.
uint32_t
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 060fb66d38f7..d2a325d5ad89 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1100,8 +1100,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureVOP3Literal, FeatureDPP8,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
- FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
- FeatureGWS, FeatureTrue16BitInsts
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+ FeatureTrue16BitInsts
]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b0eac567ec9f..bffea82ab8f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -377,7 +377,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
return Subtarget->getRegisterInfo()->getRegClass(RegClass);
}
case AMDGPU::REG_SEQUENCE: {
- unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned RCID = N->getConstantOperandVal(0);
const TargetRegisterClass *SuperRC =
Subtarget->getRegisterInfo()->getRegClass(RCID);
@@ -724,7 +724,7 @@ bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
unsigned ShAmtBits) const {
assert(N->getOpcode() == ISD::AND);
- const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+ const APInt &RHS = N->getConstantOperandAPInt(1);
if (RHS.countr_one() >= ShAmtBits)
return true;
@@ -2672,7 +2672,7 @@ void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
- unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntrID = N->getConstantOperandVal(1);
switch (IntrID) {
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
@@ -2690,7 +2690,7 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
- unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IntrID = N->getConstantOperandVal(0);
unsigned Opcode;
switch (IntrID) {
case Intrinsic::amdgcn_wqm:
@@ -2731,7 +2731,7 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
- unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntrID = N->getConstantOperandVal(1);
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 4bf4707553e5..8fbc90a6db9f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -682,7 +682,7 @@ static bool hasSourceMods(const SDNode *N) {
case ISD::BITCAST:
return false;
case ISD::INTRINSIC_WO_CHAIN: {
- switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
+ switch (N->getConstantOperandVal(0)) {
case Intrinsic::amdgcn_interp_p1:
case Intrinsic::amdgcn_interp_p2:
case Intrinsic::amdgcn_interp_mov:
@@ -837,7 +837,7 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
case ISD::TokenFactor:
return true;
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IntrID = N->getConstantOperandVal(0);
switch (IntrID) {
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane:
@@ -1489,7 +1489,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
SmallVector<SDValue, 8> Args;
- unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned Start = Op.getConstantOperandVal(1);
EVT VT = Op.getValueType();
EVT SrcVT = Op.getOperand(0).getValueType();
@@ -2502,8 +2502,7 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
case ISD::FFREXP:
return true;
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Src.getOperand(0))->getZExtValue();
+ unsigned IntrinsicID = Src.getConstantOperandVal(0);
switch (IntrinsicID) {
case Intrinsic::amdgcn_frexp_mant:
return true;
@@ -3070,18 +3069,26 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+ bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
- if (Src.getValueType() == MVT::i32) {
+ if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
// (ctlz hi:lo) -> (umin (ffbh src), 32)
// (cttz hi:lo) -> (umin (ffbl src), 32)
// (ctlz_zero_undef src) -> (ffbh src)
// (cttz_zero_undef src) -> (ffbl src)
+
+ // The 64-bit scalar versions produce a 32-bit result:
+ // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
+ // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
+ // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
+ // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
if (!ZeroUndef) {
- const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
- NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+ const SDValue ConstVal = DAG.getConstant(
+ Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
}
- return NewOpr;
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
}
SDValue Lo, Hi;
@@ -3593,7 +3600,7 @@ static SDValue simplifyMul24(SDNode *Node24,
SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
unsigned NewOpcode = Node24->getOpcode();
if (IsIntrin) {
- unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
+ unsigned IID = Node24->getConstantOperandVal(0);
switch (IID) {
case Intrinsic::amdgcn_mul_i24:
NewOpcode = AMDGPUISD::MUL_I24;
@@ -3813,7 +3820,7 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(0);
switch (IID) {
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
@@ -5644,7 +5651,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IID = Op.getConstantOperandVal(0);
switch (IID) {
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
@@ -5826,8 +5833,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
return SNaN;
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntrinsicID
- = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntrinsicID = Op.getConstantOperandVal(0);
// TODO: Handle more intrinsics
switch (IntrinsicID) {
case Intrinsic::amdgcn_cubeid:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index eaf72d7157ee..36e07d944c94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -642,6 +642,7 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
+defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
defm int_amdgcn_global_atomic_fmin_num : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c9412f720c62..fba060464a6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4690,6 +4690,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index e83e644d13f3..2d8dc9d47225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
+ [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index db5d2bbcf5bb..fc47b02c98e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -346,8 +346,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
- AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
- AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
+ AMDGPU::VGPR_16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
index 459400e3359c..79e9312034da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
@@ -85,7 +85,6 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<UniformityInfoWrapperPass>();
AU.setPreservesCFG();
}
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index beb670669581..4cc8871a00fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -243,6 +243,7 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 9bc3ba161c9e..1bfb7c0edd80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -109,9 +109,6 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
// FIXME: preserve PostDominatorTreeWrapperPass
}
- // No divergent values are changed, only blocks and branch edges.
- AU.addPreserved<UniformityInfoWrapperPass>();
-
// We preserve the non-critical-edgeness property
AU.addPreservedID(BreakCriticalEdgesID);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3b69a37728ea..abd7e911beef 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5416,11 +5416,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Val,
ValRange);
} else if (ID == ".amdhsa_shared_vgpr_count") {
- if (IVersion.Major < 10)
- return Error(IDRange.Start, "directive requires gfx10+", IDRange);
+ if (IVersion.Major < 10 || IVersion.Major >= 12)
+ return Error(IDRange.Start, "directive requires gfx10 or gfx11",
+ IDRange);
SharedVGPRCount = Val;
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3,
- COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT, Val,
+ COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, Val,
ValRange);
} else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") {
PARSE_BITS_ENTRY(
@@ -5522,7 +5523,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
(AccumOffset / 4 - 1));
}
- if (IVersion.Major >= 10) {
+ if (IVersion.Major >= 10 && IVersion.Major < 12) {
// SharedVGPRCount < 16 checked by PARSE_ENTRY_BITS
if (SharedVGPRCount && EnableWavefrontSize32 && *EnableWavefrontSize32) {
return TokError("shared_vgpr_count directive not valid on "
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 3a895923fa4b..bc9049b4ef33 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1147,7 +1147,8 @@ def : GCNPat <
>;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
-def : Pat <
+let OtherPredicates = [HasGDS] in
+def : GCNPat <
(SIds_ordered_count i32:$value, i16:$offset),
(DS_ORDERED_COUNT $value, (as_i16imm $offset))
>;
@@ -1189,7 +1190,8 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
- string opName = ps.Mnemonic>
+ string opName = ps.Mnemonic,
+ bit hasGFX12Enc = 0>
: DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
@@ -1201,6 +1203,8 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
+
+ let gds = !if(hasGFX12Enc, 0, ?);
}
//===----------------------------------------------------------------------===//
@@ -1212,7 +1216,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
defvar ps = !cast<DS_Pseudo>(NAME);
def _gfx12 :
Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
- ps.Mnemonic>;
+ ps.Mnemonic, 1>;
}
multiclass DS_Real_Renamed_gfx12<bits<8> op, DS_Pseudo backing_pseudo,
@@ -1220,7 +1224,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
def _gfx12 :
Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo,
SIEncodingFamily.GFX12,
- real_name>,
+ real_name, 1>,
MnemonicAlias<backing_pseudo.Mnemonic, real_name>,
Requires<[isGFX12Plus]>;
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ed2e7e4f189e..67be7b0fd642 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -702,6 +702,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
AMDGPU::OpName::src2_modifiers);
}
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
+ !AMDGPU::hasGDS(STI)) {
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
+ }
+
if (Res && (MCII->get(MI.getOpcode()).TSFlags &
(SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
@@ -1279,9 +1284,8 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
MCOperand AMDGPUDisassembler::createVGPR16Operand(unsigned RegIdx,
bool IsHi) const {
- unsigned RCID =
- IsHi ? AMDGPU::VGPR_HI16RegClassID : AMDGPU::VGPR_LO16RegClassID;
- return createRegOperand(RCID, RegIdx);
+ unsigned RegIdxInVGPR16 = RegIdx * 2 + (IsHi ? 1 : 0);
+ return createRegOperand(AMDGPU::VGPR_16RegClassID, RegIdxInVGPR16);
}
// Decode Literals for insts which always have a literal in the encoding
@@ -1995,34 +1999,60 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1)
return MCDisassembler::Fail;
} else if (isGFX10Plus()) {
- if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
- PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
- COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
+ // Bits [0-3].
+ if (!isGFX12Plus()) {
+ if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
+ PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
+ COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
+ } else {
+ PRINT_PSEUDO_DIRECTIVE_COMMENT(
+ "SHARED_VGPR_COUNT",
+ COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
+ }
} else {
- PRINT_PSEUDO_DIRECTIVE_COMMENT(
- "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX12_PLUS_RESERVED0)
+ return MCDisassembler::Fail;
}
- if (isGFX11Plus()) {
+ // Bits [4-11].
+ if (isGFX11()) {
PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
- COMPUTE_PGM_RSRC3_GFX11_PLUS_INST_PREF_SIZE);
+ COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE);
PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
- COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START);
+ COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START);
PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
- COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_END);
+ COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END);
+ } else if (isGFX12Plus()) {
+ PRINT_PSEUDO_DIRECTIVE_COMMENT(
+ "INST_PREF_SIZE", COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE);
+ } else {
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED1)
+ return MCDisassembler::Fail;
+ }
+
+ // Bits [12].
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED2)
+ return MCDisassembler::Fail;
+
+ // Bits [13].
+ if (isGFX12Plus()) {
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("GLG_EN",
+ COMPUTE_PGM_RSRC3_GFX12_PLUS_GLG_EN);
} else {
- if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED0)
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_GFX11_RESERVED3)
return MCDisassembler::Fail;
}
- if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED1)
+ // Bits [14-30].
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED4)
return MCDisassembler::Fail;
+ // Bits [31].
if (isGFX11Plus()) {
PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
- COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START);
+ COMPUTE_PGM_RSRC3_GFX11_PLUS_IMAGE_OP);
} else {
- if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED2)
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED5)
return MCDisassembler::Fail;
}
} else if (FourByteBuffer) {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0dd2b3f5c2c9..615f8cd54d8f 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -926,9 +926,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
-} // End is_flat_global = 1
-
+let SubtargetPredicate = isGFX12Plus in {
+ defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+} // End SubtargetPredicate = isGFX12Plus
+} // End is_flat_global = 1
let SubtargetPredicate = HasFlatScratchInsts in {
defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
@@ -1529,6 +1531,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
+let OtherPredicates = [isGFX12Plus] in {
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
+}
+
let OtherPredicates = [isGFX10Plus] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -2654,6 +2660,7 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A
defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
+defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
// ENC_VSCRATCH.
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index a855cf585205..e135a4e25dd1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -475,8 +475,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
+ }
+ if (IVersion.Major >= 10 && IVersion.Major < 12) {
PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3,
- amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
+ amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT);
}
if (IVersion.Major >= 12)
PRINT_FIELD(OS, ".amdhsa_round_robin_scheduling", KD, compute_pgm_rsrc1,
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index c1ba9c514874..9a2fb0bc37b2 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -424,8 +424,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return lowerADDRSPACECAST(Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned IntrinsicID = Op.getConstantOperandVal(1);
switch (IntrinsicID) {
case Intrinsic::r600_store_swizzle: {
SDLoc DL(Op);
@@ -449,8 +448,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntrinsicID = Op.getConstantOperandVal(0);
EVT VT = Op.getValueType();
SDLoc DL(Op);
switch (IntrinsicID) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4f4bc45e49b4..0e857e6ac71b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,10 +540,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::f16, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
-
- setOperationAction(
- {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
- MVT::f16, Promote);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
// F16 - VOP2 Actions.
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
@@ -1243,6 +1240,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
+ case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
@@ -5307,7 +5305,7 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16);
SDValue Lo0, Hi0;
SDValue Op0 = Op.getOperand(0);
@@ -5391,7 +5389,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
// Get the rounding mode from the last operand
- int RoundMode = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ int RoundMode = Op.getConstantOperandVal(1);
if (RoundMode == (int)RoundingMode::TowardPositive)
Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
else if (RoundMode == (int)RoundingMode::TowardNegative)
@@ -5701,7 +5699,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(0);
switch (IID) {
case Intrinsic::amdgcn_make_buffer_rsrc:
Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
@@ -5839,7 +5837,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
- switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+ switch (Intr->getConstantOperandVal(1)) {
case Intrinsic::amdgcn_if:
return AMDGPUISD::IF;
case Intrinsic::amdgcn_else:
@@ -5988,7 +5986,7 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Checking the depth
- if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
+ if (Op.getConstantOperandVal(0) != 0)
return DAG.getConstant(0, DL, VT);
MachineFunction &MF = DAG.getMachineFunction();
@@ -7637,7 +7635,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
- unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntrinsicID = Op.getConstantOperandVal(0);
// TODO: Should this propagate fast-math-flags?
@@ -7791,7 +7789,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned CPol = Op.getConstantOperandVal(3);
if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12))
@@ -8041,7 +8039,7 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned IntrID = Op.getConstantOperandVal(1);
SDLoc DL(Op);
switch (IntrID) {
@@ -8137,8 +8135,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format: {
- unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned Glc = Op.getConstantOperandVal(5);
+ unsigned Slc = Op.getConstantOperandVal(6);
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -8226,10 +8224,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
- unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
- unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
- unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
+ unsigned Dfmt = Op.getConstantOperandVal(7);
+ unsigned Nfmt = Op.getConstantOperandVal(8);
+ unsigned Glc = Op.getConstantOperandVal(9);
+ unsigned Slc = Op.getConstantOperandVal(10);
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -8316,7 +8314,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_fadd: {
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned Slc = Op.getConstantOperandVal(6);
unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -8477,7 +8475,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+ unsigned Slc = Op.getConstantOperandVal(7);
unsigned IdxEn = getIdxEn(Op.getOperand(5));
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -8881,7 +8879,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
- unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned IntrinsicID = Op.getConstantOperandVal(1);
MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
@@ -8946,10 +8944,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
- unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
- unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
- unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
+ unsigned Dfmt = Op.getConstantOperandVal(8);
+ unsigned Nfmt = Op.getConstantOperandVal(9);
+ unsigned Glc = Op.getConstantOperandVal(10);
+ unsigned Slc = Op.getConstantOperandVal(11);
unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
@@ -9032,8 +9030,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
- unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+ unsigned Glc = Op.getConstantOperandVal(6);
+ unsigned Slc = Op.getConstantOperandVal(7);
unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
@@ -12072,8 +12070,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntrinsicID
- = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntrinsicID = Op.getConstantOperandVal(0);
// TODO: Handle more intrinsics
switch (IntrinsicID) {
case Intrinsic::amdgcn_cvt_pkrtz:
@@ -15011,7 +15008,7 @@ void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
unsigned Opc = Op.getOpcode();
switch (Opc) {
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IID = Op.getConstantOperandVal(0);
switch (IID) {
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
@@ -15254,11 +15251,9 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case ISD::CALLSEQ_END:
return true;
case ISD::INTRINSIC_WO_CHAIN:
- return AMDGPU::isIntrinsicSourceOfDivergence(
- cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
case ISD::INTRINSIC_W_CHAIN:
- return AMDGPU::isIntrinsicSourceOfDivergence(
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
case AMDGPUISD::ATOMIC_CMP_SWAP:
case AMDGPUISD::ATOMIC_LOAD_FMIN:
case AMDGPUISD::ATOMIC_LOAD_FMAX:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e599f23101c8..396d22c7ec18 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -273,8 +273,8 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
// subtract the index by one.
Offset0Idx -= get(Opc0).NumDefs;
Offset1Idx -= get(Opc1).NumDefs;
- Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
- Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
+ Offset0 = Load0->getConstantOperandVal(Offset0Idx);
+ Offset1 = Load1->getConstantOperandVal(Offset1Idx);
return true;
}
@@ -955,12 +955,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
- bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
- AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
- AMDGPU::AGPR_LO16RegClass.contains(DestReg);
- bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
- AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
- AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
+ bool DstLow = !AMDGPU::isHi(DestReg, RI);
+ bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
MCRegister NewDestReg = RI.get32BitRegister(DestReg);
MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
@@ -4983,6 +4979,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isDS(MI) && !ST.hasGDS()) {
+ const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
+ if (GDSOp && GDSOp->getImm() != 0) {
+ ErrInfo = "GDS is not supported on this subtarget";
+ return false;
+ }
+ }
+
if (isImage(MI)) {
const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
if (DimOp) {
@@ -6904,6 +6908,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
+ case AMDGPU::S_FLBIT_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_FF1_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_LSHL_B32:
if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
@@ -7185,6 +7198,18 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Register DstReg = Inst.getOperand(0).getReg();
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
+ // hope for the best.
+ if (Inst.isCopy() && DstReg.isPhysical() &&
+ RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
+ // TODO: Only works for 32 bit registers.
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
+ .add(Inst.getOperand(1));
+ Inst.eraseFromParent();
+ return;
+ }
+
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
@@ -7837,6 +7862,61 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
+ MachineInstr &Inst, unsigned Opcode,
+ MachineDominatorTree *MDT) const {
+ // (S_FLBIT_I32_B64 hi:lo) ->
+ //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
+ // (S_FF1_I32_B64 hi:lo) ->
+ //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src = Inst.getOperand(1);
+
+ const MCInstrDesc &InstDesc = get(Opcode);
+
+ bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
+ unsigned OpcodeAdd =
+ ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+
+ const TargetRegisterClass *SrcRC =
+ Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
+ const TargetRegisterClass *SrcSubRC =
+ RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand SrcRegSub0 =
+ buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
+ MachineOperand SrcRegSub1 =
+ buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
+
+ Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
+
+ BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
+
+ BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
+ .addReg(IsCtlz ? MidReg1 : MidReg2)
+ .addImm(32)
+ .addImm(1); // enable clamp
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
+ .addReg(MidReg3)
+ .addReg(IsCtlz ? MidReg2 : MidReg1);
+
+ MRI.replaceRegWith(Dest.getReg(), MidReg4);
+
+ addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
Register DstReg, MachineRegisterInfo &MRI,
SIInstrWorklist &Worklist) const {
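
For reference, the arithmetic encoded by the new splitScalar64BitCountOp expansion above can be modelled in a few lines of C++20. This is a sketch only, not part of the patch; it assumes the documented ISA behaviour that V_FFBH_U32/V_FFBL_B32 return -1 (0xFFFFFFFF) for a zero input and that the clamped VALU add acts as an unsigned saturating add:

  #include <algorithm>
  #include <bit>
  #include <cstdint>

  // V_FFBH_U32: count leading zeros, -1 when the input is 0.
  uint32_t ffbh32(uint32_t V) {
    return V ? static_cast<uint32_t>(std::countl_zero(V)) : 0xFFFFFFFFu;
  }
  // V_FFBL_B32: count trailing zeros, -1 when the input is 0.
  uint32_t ffbl32(uint32_t V) {
    return V ? static_cast<uint32_t>(std::countr_zero(V)) : 0xFFFFFFFFu;
  }
  // V_ADD_U32 with clamp: unsigned saturating add.
  uint32_t uaddsat32(uint32_t A, uint32_t B) {
    uint64_t S = static_cast<uint64_t>(A) + B;
    return S > 0xFFFFFFFFu ? 0xFFFFFFFFu : static_cast<uint32_t>(S);
  }
  // S_FLBIT_I32_B64: 64-bit count-leading-zeros, -1 when the input is 0.
  uint32_t flbit_i32_b64(uint64_t V) {
    uint32_t Hi = static_cast<uint32_t>(V >> 32), Lo = static_cast<uint32_t>(V);
    return std::min(ffbh32(Hi), uaddsat32(ffbh32(Lo), 32));
  }
  // S_FF1_I32_B64: 64-bit count-trailing-zeros, -1 when the input is 0.
  uint32_t ff1_i32_b64(uint64_t V) {
    uint32_t Hi = static_cast<uint32_t>(V >> 32), Lo = static_cast<uint32_t>(V);
    return std::min(uaddsat32(ffbl32(Hi), 32), ffbl32(Lo));
  }

The saturated "+32" term is always at least 32, while the count for a non-zero half is at most 31, so the unsigned min selects the correct half; when both halves are zero the all-ones value propagates and yields the expected -1 result.
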
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index affe52046752..46eee6fae0a5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -144,6 +144,9 @@ private:
void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
MachineInstr &Inst) const;
void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+ void splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ unsigned Opcode,
+ MachineDominatorTree *MDT = nullptr) const;
void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9bc623abcd0..8310c6b57dad 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1487,8 +1487,18 @@ foreach Index = 0-31 in {
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
+def : BitConvert <f16, bf16, VGPR_32>;
+def : BitConvert <bf16, f16, VGPR_32>;
+
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;
+def : BitConvert <f16, bf16, SReg_32>;
+def : BitConvert <bf16, f16, SReg_32>;
+
+def : BitConvert <i16, bf16, VGPR_32>;
+def : BitConvert <bf16, i16, VGPR_32>;
+def : BitConvert <i16, bf16, SReg_32>;
+def : BitConvert <bf16, i16, SReg_32>;
// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 021d797344c5..a93cf5cad411 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -330,8 +330,10 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
RegPressureIgnoredUnits.resize(getNumRegUnits());
RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
- for (auto Reg : AMDGPU::VGPR_HI16RegClass)
- RegPressureIgnoredUnits.set(*regunits(Reg).begin());
+ for (auto Reg : AMDGPU::VGPR_16RegClass) {
+ if (AMDGPU::isHi(Reg, *this))
+ RegPressureIgnoredUnits.set(*regunits(Reg).begin());
+ }
// HACK: Until this is fully tablegen'd.
static llvm::once_flag InitializeRegSplitPartsFlag;
@@ -2661,7 +2663,7 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
if (BitWidth == 1)
return &AMDGPU::VReg_1RegClass;
if (BitWidth == 16)
- return &AMDGPU::VGPR_LO16RegClass;
+ return &AMDGPU::VGPR_16RegClass;
if (BitWidth == 32)
return &AMDGPU::VGPR_32RegClass;
return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
@@ -2808,8 +2810,6 @@ getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
const TargetRegisterClass *
SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
- if (BitWidth == 16)
- return &AMDGPU::VGPR_LO16RegClass;
if (BitWidth == 32)
return &AMDGPU::AV_32RegClass;
return ST.needsAlignedVGPRs()
@@ -3041,8 +3041,6 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
- case AMDGPU::VGPR_LO16RegClassID:
- case AMDGPU::VGPR_HI16RegClassID:
return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 981da13fe089..c94b894c5841 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -376,7 +376,7 @@ def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> {
let HasSGPR = 1;
}
-def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
+def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> {
let CopyCost = 1;
let Size = 16;
let isAllocatable = 0;
@@ -385,7 +385,7 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
// TODO: Do we need to set DwarfRegAlias on register tuples?
-def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add (sequence "SGPR%u_LO16", 0, 105))> {
let AllocationPriority = 0;
let Size = 16;
@@ -393,7 +393,7 @@ def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
let HasSGPR = 1;
}
-def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add (sequence "SGPR%u_HI16", 0, 105))> {
let isAllocatable = 0;
let Size = 16;
@@ -402,7 +402,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
}
// SGPR 32-bit registers
-def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
@@ -451,14 +451,14 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
// Trap handler TMP 32-bit registers
-def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
+def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v2bf16], 32,
(add (sequence "TTMP%u", 0, 15))> {
let isAllocatable = 0;
let HasSGPR = 1;
}
// Trap handler TMP 16-bit registers
-def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add (sequence "TTMP%u_LO16", 0, 15))> {
let Size = 16;
let isAllocatable = 0;
@@ -584,24 +584,10 @@ class RegisterTypes<list<ValueType> reg_types> {
list<ValueType> types = reg_types;
}
-def Reg16Types : RegisterTypes<[i16, f16]>;
-def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
let HasVGPR = 1 in {
-def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
- (add (sequence "VGPR%u_LO16", 0, 255))> {
- let AllocationPriority = 0;
- let Size = 16;
- let GeneratePressureSet = 0;
-}
-
-def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
- (add (sequence "VGPR%u_HI16", 0, 255))> {
- let AllocationPriority = 0;
- let Size = 16;
- let GeneratePressureSet = 0;
-}
-
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
@@ -697,7 +683,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
}
// AccVGPR 32-bit registers
-def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 0;
let Size = 32;
@@ -749,7 +735,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
-def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add FP_REG, SP_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -757,7 +743,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16
let BaseClassOrder = 10000;
}
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -774,7 +760,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
let GeneratePressureSet = 0, HasSGPR = 1 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
@@ -783,7 +769,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2
let AllocationPriority = 0;
}
-def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16,
TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16,
@@ -796,17 +782,17 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
let BaseClassOrder = 16;
}
-def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XM0_XEXEC, M0_CLASS)> {
let AllocationPriority = 0;
}
-def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XEXEC, EXEC_LO)> {
let AllocationPriority = 0;
}
-def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 0;
}
@@ -814,7 +800,7 @@ def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i
} // End GeneratePressureSet = 0
// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XM0, M0_CLASS)> {
let AllocationPriority = 0;
let HasSGPR = 1;
@@ -822,13 +808,13 @@ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1],
}
let GeneratePressureSet = 0 in {
-def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasSGPR = 1;
}
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 1;
@@ -850,13 +836,13 @@ def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
let HasSGPR = 1;
}
-def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
+def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], 32,
(add TTMP_64Regs)> {
let isAllocatable = 0;
let HasSGPR = 1;
}
-def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
+def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, SRC_SHARED_BASE,
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
@@ -864,7 +850,7 @@ def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16
let HasSGPR = 1;
}
-def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
+def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 1;
@@ -919,11 +905,11 @@ multiclass SRegClass<int numRegs,
}
defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
-defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>;
defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
@@ -934,7 +920,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}
-def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
@@ -969,15 +955,15 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
}
}
-defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
+defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
(add VGPR_64)>;
defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
-defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
+defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>;
defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>;
defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>;
defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
@@ -1007,7 +993,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add AGPR_128)>;
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
@@ -1046,14 +1032,14 @@ def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
let HasVGPR = 1;
}
-def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
}
-def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0f92a56237ac..a91d77175234 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2296,8 +2296,6 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
- case AMDGPU::VGPR_LO16RegClassID:
- case AMDGPU::VGPR_HI16RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
case AMDGPU::AGPR_LO16RegClassID:
return 16;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index d3cefb339d9e..7f52501b5d90 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -190,9 +190,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
// because dealing with the write to high half of the register is
// difficult.
def : GCNPat <
- (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
(v2f16 (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
@@ -203,9 +203,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(build_vector
f16:$elt0,
- (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
(v2f16 (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
@@ -215,12 +215,12 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(AMDGPUclamp (build_vector
- (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
- (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
(v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
$hi_src1_modifiers, $hi_src1,
$hi_src2_modifiers, $hi_src2,
@@ -243,8 +243,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
def : GCNPat <
- (build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
(v2f16 (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp
index 5d9a366f5ed5..2265f5db6737 100644
--- a/llvm/lib/Target/ARC/ARCISelLowering.cpp
+++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp
@@ -751,7 +751,7 @@ SDValue ARCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
- assert(cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0 &&
+ assert(Op.getConstantOperandVal(0) == 0 &&
"Only support lowering frame addr of current frame.");
Register FrameReg = ARI.getFrameRegister(MF);
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index a0776296b8eb..ef02dc997011 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4499,8 +4499,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
default: break;
case ARM::LDRrs:
case ARM::LDRBrs: {
- unsigned ShOpVal =
- cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+ unsigned ShOpVal = DefNode->getConstantOperandVal(2);
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (ShImm == 0 ||
(ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
@@ -4512,8 +4511,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::t2LDRHs:
case ARM::t2LDRSHs: {
// Thumb2 mode: lsl only.
- unsigned ShAmt =
- cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+ unsigned ShAmt = DefNode->getConstantOperandVal(2);
if (ShAmt == 0 || ShAmt == 2)
Latency = *Latency - 1;
break;
@@ -4526,8 +4524,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
default: break;
case ARM::LDRrs:
case ARM::LDRBrs: {
- unsigned ShOpVal =
- cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+ unsigned ShOpVal = DefNode->getConstantOperandVal(2);
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
if (ShImm == 0 ||
((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 10d9c7f275be..eeb7f64aa581 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -2692,8 +2692,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
const Align TargetAlign = getStackAlign();
if (TargetAlign >= Align(8) && (NumGPRSpills & 1)) {
if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
- for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
- unsigned Reg = UnspilledCS1GPRs[i];
+ for (unsigned Reg : UnspilledCS1GPRs) {
// Don't spill high register if the function is thumb. In the case of
// Windows on ARM, accept R11 (frame pointer)
if (!AFI->isThumbFunction() ||
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 984d8d3e0b08..adc429b61bbc 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2422,8 +2422,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
SDValue Chain = N->getOperand(0);
- unsigned Lane =
- cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+ unsigned Lane = N->getConstantOperandVal(Vec0Idx + NumVecs);
EVT VT = N->getOperand(Vec0Idx).getValueType();
bool is64BitVector = VT.is64BitVector();
@@ -2587,7 +2586,7 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes,
Ops.push_back(N->getOperand(2)); // vector of base addresses
- int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ int32_t ImmValue = N->getConstantOperandVal(3);
Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate offset
if (Predicated)
@@ -2622,7 +2621,7 @@ void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
// The shift count
if (Immediate) {
- int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ int32_t ImmValue = N->getConstantOperandVal(3);
Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
} else {
Ops.push_back(N->getOperand(3));
@@ -2630,7 +2629,7 @@ void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
// The immediate saturation operand, if any
if (HasSaturationOperand) {
- int32_t SatOp = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
+ int32_t SatOp = N->getConstantOperandVal(4);
int SatBit = (SatOp == 64 ? 0 : 1);
Ops.push_back(getI32Imm(SatBit, Loc));
}
@@ -2685,7 +2684,7 @@ void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) {
// and then an immediate shift count
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(2));
- int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ int32_t ImmValue = N->getConstantOperandVal(3);
Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
if (Predicated)
@@ -4138,14 +4137,13 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
if (InGlue.getOpcode() == ARMISD::CMPZ) {
if (InGlue.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) {
SDValue Int = InGlue.getOperand(0);
- uint64_t ID = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
+ uint64_t ID = Int->getConstantOperandVal(1);
// Handle low-overhead loops.
if (ID == Intrinsic::loop_decrement_reg) {
SDValue Elements = Int.getOperand(2);
- SDValue Size = CurDAG->getTargetConstant(
- cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl,
- MVT::i32);
+ SDValue Size = CurDAG->getTargetConstant(Int.getConstantOperandVal(3),
+ dl, MVT::i32);
SDValue Args[] = { Elements, Size, Int.getOperand(0) };
SDNode *LoopDec =
@@ -4715,7 +4713,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default:
break;
@@ -4732,9 +4730,9 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2);
SmallVector<SDValue, 5> Ops;
- Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(), dl)); /* coproc */
- Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), dl)); /* opc */
- Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(), dl)); /* CRm */
+ Ops.push_back(getI32Imm(N->getConstantOperandVal(2), dl)); /* coproc */
+ Ops.push_back(getI32Imm(N->getConstantOperandVal(3), dl)); /* opc */
+ Ops.push_back(getI32Imm(N->getConstantOperandVal(4), dl)); /* CRm */
// The mrrc2 instruction in ARM doesn't allow predicates, the top 4 bits of the encoded
// instruction will always be '1111' but it is possible in assembly language to specify
@@ -5181,7 +5179,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(0);
switch (IntNo) {
default:
break;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d00b7853816e..9f3bcffc7a99 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -4110,7 +4110,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
@@ -4289,13 +4289,13 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
return Op.getOperand(0);
SDLoc dl(Op);
- unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
+ unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
if (!isRead &&
(!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
// ARMv7 with MP extension has PLDW.
return Op.getOperand(0);
- unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ unsigned isData = Op.getConstantOperandVal(4);
if (Subtarget->isThumb()) {
// Invert the bits.
isRead = ~isRead & 1;
@@ -4800,7 +4800,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
!isSignedIntSetCC(CC)) {
- unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
+ unsigned Mask = LHS.getConstantOperandVal(1);
auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
uint64_t RHSV = RHSC->getZExtValue();
if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
@@ -4823,9 +4823,8 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
isa<ConstantSDNode>(RHS) &&
cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
- cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
- unsigned ShiftAmt =
- cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
+ LHS.getConstantOperandVal(1) < 31) {
+ unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
DAG.getVTList(MVT::i32, MVT::i32),
LHS.getOperand(0),
@@ -6112,7 +6111,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
EVT VT = Op.getValueType();
SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
@@ -6135,7 +6134,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op); // FIXME probably not meaningful
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
Register FrameReg = ARI.getFrameRegister(MF);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
@@ -8221,7 +8220,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
- unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ unsigned EltNo = V.getConstantOperandVal(1);
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
@@ -9034,7 +9033,7 @@ static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
SDValue Conv =
DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
- unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Lane = Op.getConstantOperandVal(2);
unsigned LaneWidth =
getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
@@ -9097,7 +9096,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
SDValue Conv =
DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
- unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned Lane = Op.getConstantOperandVal(1);
unsigned LaneWidth =
getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
@@ -10682,7 +10681,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(0);
unsigned Opc = 0;
if (IntNo == Intrinsic::arm_smlald)
Opc = ARMISD::SMLALD;
@@ -14842,14 +14841,14 @@ static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
assert(N->getOpcode() == ARMISD::BFI);
SDValue From = N->getOperand(1);
- ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
+ ToMask = ~N->getConstantOperandAPInt(2);
FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
// If the Base came from a SHR #C, we can deduce that it is really testing bit
// #C in the base of the SHR.
if (From->getOpcode() == ISD::SRL &&
isa<ConstantSDNode>(From->getOperand(1))) {
- APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
+ APInt Shift = From->getConstantOperandAPInt(1);
assert(Shift.getLimitedValue() < 32 && "Shift too large!");
FromMask <<= Shift.getLimitedValue(31);
From = From->getOperand(0);
@@ -14908,7 +14907,7 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N11C)
return SDValue();
- unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned InvMask = N->getConstantOperandVal(2);
unsigned LSB = llvm::countr_zero(~InvMask);
unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
assert(Width <
@@ -15448,8 +15447,7 @@ static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
- ARMCC::CondCodes Cond =
- (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
SDLoc dl(N);
// vcmp X, 0, cc -> vcmpz X, cc
@@ -15794,7 +15792,7 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
unsigned NewOpc = 0;
unsigned NumVecs = 0;
if (Target.isIntrinsic) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default:
llvm_unreachable("unexpected intrinsic for Neon base update");
@@ -16254,12 +16252,10 @@ static SDValue PerformMVEVLDCombine(SDNode *N,
// For the stores, where there are multiple intrinsics we only actually want
// to post-inc the last of the them.
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- if (IntNo == Intrinsic::arm_mve_vst2q &&
- cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
+ unsigned IntNo = N->getConstantOperandVal(1);
+ if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
return SDValue();
- if (IntNo == Intrinsic::arm_mve_vst4q &&
- cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
+ if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
return SDValue();
// Search for a use of the address operand that is an increment.
@@ -16381,7 +16377,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return false;
unsigned NumVecs = 0;
unsigned NewOpc = 0;
- unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+ unsigned IntNo = VLD->getConstantOperandVal(1);
if (IntNo == Intrinsic::arm_neon_vld2lane) {
NumVecs = 2;
NewOpc = ARMISD::VLD2DUP;
@@ -16397,8 +16393,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// First check that all the vldN-lane uses are VDUPLANEs and that the lane
// numbers match the load.
- unsigned VLDLaneNo =
- cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
+ unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
UI != UE; ++UI) {
// Ignore uses of the chain result.
@@ -16406,7 +16401,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
continue;
SDNode *User = *UI;
if (User->getOpcode() != ARMISD::VDUPLANE ||
- VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+ VLDLaneNo != User->getConstantOperandVal(1))
return false;
}
@@ -16479,7 +16474,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
// Make sure the VMOV element size is not bigger than the VDUPLANE elements.
unsigned EltSize = Op.getScalarValueSizeInBits();
// The canonical VMOV for a zero vector uses a 32-bit element size.
- unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Imm = Op.getConstantOperandVal(0);
unsigned EltBits;
if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
EltSize = 8;
@@ -17479,7 +17474,7 @@ static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(0);
switch (IntNo) {
default:
// Don't do anything for most intrinsics.
@@ -17669,7 +17664,7 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
case Intrinsic::arm_mve_addv: {
// Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
// which allow PerformADDVecReduce to turn it into VADDLV when possible.
- bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ bool Unsigned = N->getConstantOperandVal(2);
unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
}
@@ -17678,7 +17673,7 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
case Intrinsic::arm_mve_addlv_predicated: {
// Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
// which recombines the two outputs into an i64
- bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ bool Unsigned = N->getConstantOperandVal(2);
unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
(Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
(Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
@@ -18193,7 +18188,7 @@ static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
}
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
+ unsigned IntOp = N.getConstantOperandVal(1);
if (IntOp != Intrinsic::test_start_loop_iterations &&
IntOp != Intrinsic::loop_decrement_reg)
return SDValue();
@@ -18271,7 +18266,7 @@ static SDValue PerformHWLoopCombine(SDNode *N,
SDLoc dl(Int);
SelectionDAG &DAG = DCI.DAG;
SDValue Elements = Int.getOperand(2);
- unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
+ unsigned IntOp = Int->getConstantOperandVal(1);
assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
&& "expected single br user");
SDNode *Br = *N->use_begin();
@@ -18305,8 +18300,8 @@ static SDValue PerformHWLoopCombine(SDNode *N,
DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
return Res;
} else {
- SDValue Size = DAG.getTargetConstant(
- cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
+ SDValue Size =
+ DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
SDValue Args[] = { Int.getOperand(0), Elements, Size, };
SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
DAG.getVTList(MVT::i32, MVT::Other), Args);
@@ -19051,7 +19046,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
- switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ switch (N->getConstantOperandVal(1)) {
case Intrinsic::arm_neon_vld1:
case Intrinsic::arm_neon_vld1x2:
case Intrinsic::arm_neon_vld1x3:
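
The change running through this file (and most of the target diffs below) replaces the long-hand cast<ConstantSDNode>(...)->getZExtValue() / getAPIntValue() reads with SDNode's dedicated accessors. A minimal sketch of the equivalence, compiling against LLVM's SelectionDAG headers; the helper names are illustrative, not from the patch:

    #include "llvm/CodeGen/SelectionDAGNodes.h"

    // Both spellings read operand 1 as a constant; the accessor asserts the
    // operand really is a ConstantSDNode and returns its zero-extended value.
    static unsigned getIntrinsicNumber(const llvm::SDNode *N) {
      // Old: cast<llvm::ConstantSDNode>(N->getOperand(1))->getZExtValue();
      return N->getConstantOperandVal(1);
    }

    // The APInt flavour used in ParseBFI keeps the operand's full bit width
    // instead of truncating to uint64_t.
    static llvm::APInt getInvertedMask(const llvm::SDNode *N) {
      return ~N->getConstantOperandAPInt(2);
    }
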
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index a679699a66c7..ed9d30c3c3ab 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -2604,16 +2604,14 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
}
// Re-schedule loads.
- for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
- unsigned Base = LdBases[i];
+ for (unsigned Base : LdBases) {
SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
if (Lds.size() > 1)
RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap, RegisterMap);
}
// Re-schedule stores.
- for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
- unsigned Base = StBases[i];
+ for (unsigned Base : StBases) {
SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
if (Sts.size() > 1)
RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap, RegisterMap);
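
The two reschedule loops are converted from index-based iteration to range-based for without changing behaviour. A standalone C++ illustration of the same rewrite; the container and names here are generic, not from the patch:

    #include <vector>

    static unsigned sumBases(const std::vector<unsigned> &Bases) {
      unsigned Total = 0;
      // Before: for (unsigned i = 0, e = Bases.size(); i != e; ++i) { unsigned Base = Bases[i]; ... }
      for (unsigned Base : Bases) // same order, no manual indexing
        Total += Base;
      return Total;
    }
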
diff --git a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 196122e45ab8..e67a1e2ed509 100644
--- a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -335,7 +335,7 @@ template <> bool AVRDAGToDAGISel::select<ISD::STORE>(SDNode *N) {
return false;
}
- int CST = (int)cast<ConstantSDNode>(BasePtr.getOperand(1))->getZExtValue();
+ int CST = (int)BasePtr.getConstantOperandVal(1);
SDValue Chain = ST->getChain();
EVT VT = ST->getValue().getValueType();
SDLoc DL(N);
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index cd1dcfaea0eb..d36bfb188ed3 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -298,8 +298,7 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
SDValue SrcHi =
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
DAG.getConstant(1, dl, MVT::i16));
- uint64_t ShiftAmount =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ uint64_t ShiftAmount = N->getConstantOperandVal(1);
if (ShiftAmount == 16) {
// Special case these two operations because they appear to be used by the
// generic codegen parts to lower 32-bit numbers.
@@ -367,7 +366,7 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
}
}
- uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ uint64_t ShiftAmount = N->getConstantOperandVal(1);
SDValue Victim = N->getOperand(0);
switch (Op.getOpcode()) {
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 909c7c005735..d8139958e9fc 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -193,7 +193,7 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
default:
break;
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
case Intrinsic::bpf_load_byte:
case Intrinsic::bpf_load_half:
@@ -469,7 +469,7 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN)
return;
- unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue();
+ unsigned IntNo = BaseV->getConstantOperandVal(1);
uint64_t MaskV = MaskN->getZExtValue();
if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) ||
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index e3b4a2dc048a..90f70b83a02d 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -1219,7 +1219,7 @@ SDValue CSKYTargetLowering::LowerFRAMEADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
Register FrameReg = RI.getFrameRegister(MF);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
@@ -1240,7 +1240,7 @@ SDValue CSKYTargetLowering::LowerRETURNADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
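
Both CSKY lowerings now read the constant depth operand via getConstantOperandVal and then walk saved frame pointers that many times. A plain-C++ sketch of the walk those chained loads express; the assumption that each frame stores its caller's FP at the frame base is illustrative only:

    // Follow the saved-frame-pointer chain `Depth` times, as the generated
    // loads do on targets where the previous FP is spilled at the frame base.
    static const void *walkFrameChain(const void *FramePtr, unsigned Depth) {
      while (Depth--)
        FramePtr = *static_cast<const void *const *>(FramePtr);
      return FramePtr;
    }
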
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index e2d0aeee092e..ebb269c6e6e0 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -937,8 +937,7 @@ void DXILBitcodeWriter::writeAttributeTable() {
Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
SmallVector<uint64_t, 64> Record;
- for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
- AttributeList AL = Attrs[i];
+ for (AttributeList AL : Attrs) {
for (unsigned i : AL.indexes()) {
AttributeSet AS = AL.getAttributes(i);
if (AS.hasAttributes())
diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 47fbf0a69518..dae316ccb5e9 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -2860,8 +2860,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
// For each defined register, if it is a constant, create an instruction
// NewR = const
// and replace all uses of the defined register with NewR.
- for (unsigned i = 0, n = DefRegs.size(); i < n; ++i) {
- unsigned R = DefRegs[i];
+ for (unsigned R : DefRegs) {
const LatticeCell &L = Inputs.get(R);
if (L.isBottom())
continue;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index f930015026a5..eb5c59672224 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -192,7 +192,7 @@ MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
return nullptr;
SDLoc dl(IntN);
- unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+ unsigned IntNo = IntN->getConstantOperandVal(1);
static std::map<unsigned,unsigned> LoadPciMap = {
{ Intrinsic::hexagon_circ_ldb, Hexagon::L2_loadrb_pci },
@@ -284,18 +284,18 @@ bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
// can provide an address of an unsigned variable to store the result of
// a sign-extending intrinsic into (or the other way around).
ISD::LoadExtType IntExt;
- switch (cast<ConstantSDNode>(C->getOperand(1))->getZExtValue()) {
- case Intrinsic::hexagon_circ_ldub:
- case Intrinsic::hexagon_circ_lduh:
- IntExt = ISD::ZEXTLOAD;
- break;
- case Intrinsic::hexagon_circ_ldw:
- case Intrinsic::hexagon_circ_ldd:
- IntExt = ISD::NON_EXTLOAD;
- break;
- default:
- IntExt = ISD::SEXTLOAD;
- break;
+ switch (C->getConstantOperandVal(1)) {
+ case Intrinsic::hexagon_circ_ldub:
+ case Intrinsic::hexagon_circ_lduh:
+ IntExt = ISD::ZEXTLOAD;
+ break;
+ case Intrinsic::hexagon_circ_ldw:
+ case Intrinsic::hexagon_circ_ldd:
+ IntExt = ISD::NON_EXTLOAD;
+ break;
+ default:
+ IntExt = ISD::SEXTLOAD;
+ break;
}
if (N->getExtensionType() != IntExt)
return false;
@@ -325,7 +325,7 @@ bool HexagonDAGToDAGISel::SelectBrevLdIntrinsic(SDNode *IntN) {
return false;
const SDLoc &dl(IntN);
- unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+ unsigned IntNo = IntN->getConstantOperandVal(1);
static const std::map<unsigned, unsigned> LoadBrevMap = {
{ Intrinsic::hexagon_L2_loadrb_pbr, Hexagon::L2_loadrb_pbr },
@@ -366,7 +366,7 @@ bool HexagonDAGToDAGISel::SelectNewCircIntrinsic(SDNode *IntN) {
return false;
SDLoc DL(IntN);
- unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+ unsigned IntNo = IntN->getConstantOperandVal(1);
SmallVector<SDValue, 7> Ops;
static std::map<unsigned,unsigned> LoadNPcMap = {
@@ -641,7 +641,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
if (SelectNewCircIntrinsic(N))
return;
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
if (IntNo == Intrinsic::hexagon_V6_vgathermw ||
IntNo == Intrinsic::hexagon_V6_vgathermw_128B ||
IntNo == Intrinsic::hexagon_V6_vgathermh ||
@@ -665,7 +665,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
}
void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(0);
unsigned Bits;
switch (IID) {
case Intrinsic::hexagon_S2_vsplatrb:
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index efb0d405fef2..fb156f2583e8 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -1337,8 +1337,7 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
// segments that are used in the output.
unsigned Seg0 = ~0u, Seg1 = ~0u;
- for (int I = 0, E = SegMap.size(); I != E; ++I) {
- unsigned X = SegMap[I];
+ for (unsigned X : SegMap) {
if (X == ~0u)
continue;
if (Seg0 == ~0u)
@@ -2037,8 +2036,7 @@ HvxSelector::completeToPerfect(ArrayRef<uint32_t> Completions, unsigned Width) {
#ifndef NDEBUG
// Check that we have generated a valid completion.
uint32_t OrAll = 0;
- for (unsigned I = 0, E = Comps.size(); I != E; ++I) {
- uint32_t C = Comps[I];
+ for (uint32_t C : Comps) {
assert(isPowerOf2_32(C));
OrAll |= C;
}
@@ -2897,7 +2895,7 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32);
unsigned Opcode;
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default:
llvm_unreachable("Unexpected HVX gather intrinsic.");
@@ -2936,7 +2934,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32);
unsigned Opcode;
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default:
llvm_unreachable("Unexpected HVX gather intrinsic.");
@@ -2965,7 +2963,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
}
void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(0);
SDNode *Result;
switch (IID) {
case Intrinsic::hexagon_V6_vaddcarry: {
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index a7d452e7227d..51138091f4a5 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -669,8 +669,7 @@ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
--NumOps; // Ignore the flag operand.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- const InlineAsm::Flag Flags(
- cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue());
+ const InlineAsm::Flag Flags(Op.getConstantOperandVal(i));
unsigned NumVals = Flags.getNumOperandRegisters();
++i; // Skip the ID value.
@@ -729,7 +728,7 @@ SDValue HexagonTargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(1);
// Lower the hexagon_prefetch builtin to DCFETCH, as above.
if (IntNo == Intrinsic::hexagon_prefetch) {
SDValue Addr = Op.getOperand(2);
@@ -1176,7 +1175,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
@@ -1198,7 +1197,7 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
HRI.getFrameRegister(), VT);
while (Depth--)
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index db416a500f59..665e2d79c83d 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -2127,7 +2127,7 @@ HexagonTargetLowering::LowerHvxFunnelShift(SDValue Op,
SDValue
HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const {
const SDLoc &dl(Op);
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
SmallVector<SDValue> Ops(Op->ops().begin(), Op->ops().end());
auto Swap = [&](SDValue P) {
@@ -2922,7 +2922,7 @@ SDValue
HexagonTargetLowering::RemoveTLWrapper(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == HexagonISD::TL_EXTEND ||
Op.getOpcode() == HexagonISD::TL_TRUNCATE);
- unsigned Opc = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Opc = Op.getConstantOperandVal(2);
return DAG.getNode(Opc, SDLoc(Op), ty(Op), Op.getOperand(0));
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index fffd5abd9f8b..0740ac58a338 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -554,7 +554,7 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
// Add qfloat subtarget feature by default to v68 and above
// unless explicitely disabled
if (checkFeature(X, Hexagon::ExtensionHVXV68) &&
- ArchFS.find("-hvx-qfloat", 0) == std::string::npos) {
+ !ArchFS.contains("-hvx-qfloat")) {
llvm::FeatureBitset Features = X->getFeatureBits();
X->setFeatureBits(Features.set(Hexagon::ExtensionHVXQFloat));
}
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index cbb5c2b998e2..17d7ffb586f4 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -1057,7 +1057,7 @@ SDValue LanaiTargetLowering::LowerRETURNADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const unsigned Offset = -4;
@@ -1080,7 +1080,7 @@ SDValue LanaiTargetLowering::LowerFRAMEADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Lanai::FP, VT);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
while (Depth--) {
const unsigned Offset = -8;
SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 276374afee38..66a37fce5dda 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -85,7 +85,7 @@ class LoongArchAsmParser : public MCTargetAsmParser {
// "emitLoadAddress*" functions.
void emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg,
const MCExpr *Symbol, SmallVectorImpl<Inst> &Insts,
- SMLoc IDLoc, MCStreamer &Out);
+ SMLoc IDLoc, MCStreamer &Out, bool RelaxHint = false);
// Helper to emit pseudo instruction "la.abs $rd, sym".
void emitLoadAddressAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
@@ -748,12 +748,14 @@ bool LoongArchAsmParser::ParseInstruction(ParseInstructionInfo &Info,
void LoongArchAsmParser::emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg,
const MCExpr *Symbol,
SmallVectorImpl<Inst> &Insts,
- SMLoc IDLoc, MCStreamer &Out) {
+ SMLoc IDLoc, MCStreamer &Out,
+ bool RelaxHint) {
MCContext &Ctx = getContext();
for (LoongArchAsmParser::Inst &Inst : Insts) {
unsigned Opc = Inst.Opc;
LoongArchMCExpr::VariantKind VK = Inst.VK;
- const LoongArchMCExpr *LE = LoongArchMCExpr::create(Symbol, VK, Ctx);
+ const LoongArchMCExpr *LE =
+ LoongArchMCExpr::create(Symbol, VK, Ctx, RelaxHint);
switch (Opc) {
default:
llvm_unreachable("unexpected opcode");
@@ -854,7 +856,7 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc,
Insts.push_back(
LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12));
- emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out);
+ emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true);
}
void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc,
@@ -900,7 +902,7 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc,
Insts.push_back(
LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12));
- emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out);
+ emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true);
}
void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc,
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 4794a131edae..e14bbadf9ed2 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -286,7 +286,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UNDEF, VT, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Legal);
@@ -406,6 +406,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerWRITE_REGISTER(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
@@ -514,6 +516,23 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
}
SDValue
+LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VecTy = Op->getOperand(0)->getValueType(0);
+ SDValue Idx = Op->getOperand(1);
+ EVT EltTy = VecTy.getVectorElementType();
+ unsigned NumElts = VecTy.getVectorNumElements();
+
+ if (isa<ConstantSDNode>(Idx) &&
+ (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 ||
+ EltTy == MVT::f64 ||
+ cast<ConstantSDNode>(Idx)->getZExtValue() < NumElts / 2))
+ return Op;
+
+ return SDValue();
+}
+
+SDValue
LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
if (isa<ConstantSDNode>(Op->getOperand(2)))
@@ -569,7 +588,7 @@ SDValue LoongArchTargetLowering::lowerFRAMEADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
int GRLenInBytes = Subtarget.getGRLen() / 8;
while (Depth--) {
@@ -588,7 +607,7 @@ SDValue LoongArchTargetLowering::lowerRETURNADDR(SDValue Op,
return SDValue();
// Currently only support lowering return address for current frame.
- if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0) {
+ if (Op.getConstantOperandVal(0) != 0) {
DAG.getContext()->emitError(
"return address can only be determined for the current frame");
return SDValue();
@@ -1244,7 +1263,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
return emitIntrinsicWithChainErrorMessage(Op, ErrorMsgReqLA64, DAG);
case Intrinsic::loongarch_csrrd_w:
case Intrinsic::loongarch_csrrd_d: {
- unsigned Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Imm = Op.getConstantOperandVal(2);
return !isUInt<14>(Imm)
? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::CSRRD, DL, {GRLenVT, MVT::Other},
@@ -1252,7 +1271,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::loongarch_csrwr_w:
case Intrinsic::loongarch_csrwr_d: {
- unsigned Imm = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned Imm = Op.getConstantOperandVal(3);
return !isUInt<14>(Imm)
? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::CSRWR, DL, {GRLenVT, MVT::Other},
@@ -1261,7 +1280,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::loongarch_csrxchg_w:
case Intrinsic::loongarch_csrxchg_d: {
- unsigned Imm = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ unsigned Imm = Op.getConstantOperandVal(4);
return !isUInt<14>(Imm)
? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::CSRXCHG, DL, {GRLenVT, MVT::Other},
@@ -1287,7 +1306,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
{Chain, Op.getOperand(2)});
}
case Intrinsic::loongarch_lddir_d: {
- unsigned Imm = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned Imm = Op.getConstantOperandVal(3);
return !isUInt<8>(Imm)
? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG)
: Op;
@@ -1295,7 +1314,7 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::loongarch_movfcsr2gr: {
if (!Subtarget.hasBasicF())
return emitIntrinsicWithChainErrorMessage(Op, ErrorMsgReqF, DAG);
- unsigned Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Imm = Op.getConstantOperandVal(2);
return !isUInt<2>(Imm)
? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG)
: DAG.getNode(LoongArchISD::MOVFCSR2GR, DL, {VT, MVT::Other},
@@ -1441,7 +1460,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,
ASRT_LE_GT_CASE(asrtgt_d)
#undef ASRT_LE_GT_CASE
case Intrinsic::loongarch_ldpte_d: {
- unsigned Imm = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned Imm = Op.getConstantOperandVal(3);
return !Subtarget.is64Bit()
? emitIntrinsicErrorMessage(Op, ErrorMsgReqLA64, DAG)
: !isUInt<8>(Imm) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
@@ -1454,53 +1473,53 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,
: SDValue();
case Intrinsic::loongarch_lasx_xvstelm_b:
return (!isInt<8>(cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<5>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<5>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
: SDValue();
case Intrinsic::loongarch_lsx_vstelm_b:
return (!isInt<8>(cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<4>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<4>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)
: SDValue();
case Intrinsic::loongarch_lasx_xvstelm_h:
return (!isShiftedInt<8, 1>(
cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<4>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<4>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(
Op, "argument out of range or not a multiple of 2", DAG)
: SDValue();
case Intrinsic::loongarch_lsx_vstelm_h:
return (!isShiftedInt<8, 1>(
cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<3>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<3>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(
Op, "argument out of range or not a multiple of 2", DAG)
: SDValue();
case Intrinsic::loongarch_lasx_xvstelm_w:
return (!isShiftedInt<8, 2>(
cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<3>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<3>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(
Op, "argument out of range or not a multiple of 4", DAG)
: SDValue();
case Intrinsic::loongarch_lsx_vstelm_w:
return (!isShiftedInt<8, 2>(
cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<2>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<2>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(
Op, "argument out of range or not a multiple of 4", DAG)
: SDValue();
case Intrinsic::loongarch_lasx_xvstelm_d:
return (!isShiftedInt<8, 3>(
cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<2>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<2>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(
Op, "argument out of range or not a multiple of 8", DAG)
: SDValue();
case Intrinsic::loongarch_lsx_vstelm_d:
return (!isShiftedInt<8, 3>(
cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue()) ||
- !isUInt<1>(cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue()))
+ !isUInt<1>(Op.getConstantOperandVal(5)))
? emitIntrinsicErrorMessage(
Op, "argument out of range or not a multiple of 8", DAG)
: SDValue();
@@ -1673,7 +1692,7 @@ replaceVPICKVE2GRResults(SDNode *Node, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, const LoongArchSubtarget &Subtarget,
unsigned ResOp) {
const StringRef ErrorMsgOOR = "argument out of range";
- unsigned Imm = cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue();
+ unsigned Imm = Node->getConstantOperandVal(2);
if (!isUInt<N>(Imm)) {
emitErrorAndReplaceIntrinsicResults(Node, Results, DAG, ErrorMsgOOR,
/*WithChain=*/false);
@@ -1976,7 +1995,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
break;
}
case Intrinsic::loongarch_csrwr_w: {
- unsigned Imm = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ unsigned Imm = N->getConstantOperandVal(3);
if (!isUInt<14>(Imm)) {
emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR);
return;
@@ -1991,7 +2010,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(
break;
}
case Intrinsic::loongarch_csrxchg_w: {
- unsigned Imm = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
+ unsigned Imm = N->getConstantOperandVal(4);
if (!isUInt<14>(Imm)) {
emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR);
return;
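
The new lowerEXTRACT_VECTOR_ELT hook keeps an extract legal only under the condition shown in its hunk earlier in this file. Restated as standalone C++ for clarity (the function name is illustrative):

    // True when the extract can be selected directly: the index must be a
    // compile-time constant, and either the element is a 32/64-bit scalar
    // (which has an XVPICKVE2GR / MOVGR2FR path) or the lane lies in the low
    // half of the 256-bit vector; everything else is left to the default
    // legalization.
    static bool isDirectlySelectableExtract(bool IdxIsConstant, unsigned Idx,
                                            unsigned EltSizeInBits,
                                            unsigned NumElts) {
      if (!IdxIsConstant)
        return false;
      if (EltSizeInBits == 32 || EltSizeInBits == 64)
        return true;
      return Idx < NumElts / 2;
    }
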
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 2d73a7394946..6f8878f9ccd5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -279,6 +279,7 @@ private:
SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ec6983d0f487..b3c11bc5423d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1571,11 +1571,11 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk),
def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk),
(XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>;
-// XVREPL128VEI_{W/D}
+// XVREPLVE0_{W/D}
def : Pat<(lasxsplatf32 FPR32:$fj),
- (XVREPL128VEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>;
+ (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
def : Pat<(lasxsplatf64 FPR64:$fj),
- (XVREPL128VEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)>;
+ (XVREPLVE0_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64))>;
// Loads/Stores
foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in {
@@ -1590,42 +1590,18 @@ def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)),
(VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>;
def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)),
(VPICKVE2GR_H (EXTRACT_SUBREG v16i16:$xj, sub_128), uimm3:$imm)>;
-def : Pat<(i64 (vector_extract v8i32:$xj, uimm2:$imm)),
- (VPICKVE2GR_W (EXTRACT_SUBREG v8i32:$xj, sub_128), uimm2:$imm)>;
-def : Pat<(i64 (vector_extract v4i64:$xj, uimm1:$imm)),
- (VPICKVE2GR_D (EXTRACT_SUBREG v4i64:$xj, sub_128), uimm1:$imm)>;
-def : Pat<(f32 (vector_extract v8f32:$xj, uimm2:$imm)),
- (f32 (EXTRACT_SUBREG (XVREPL128VEI_W v8f32:$xj, uimm2:$imm), sub_32))>;
-def : Pat<(f64 (vector_extract v4f64:$xj, uimm1:$imm)),
- (f64 (EXTRACT_SUBREG (XVREPL128VEI_D v4f64:$xj, uimm1:$imm), sub_64))>;
-
-// Vector extraction with variable index.
-def : Pat<(i64 (vector_extract v32i8:$xj, i64:$rk)),
- (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_B v32i8:$xj,
- i64:$rk),
- sub_32)),
- GPR), (i64 24))>;
-def : Pat<(i64 (vector_extract v16i16:$xj, i64:$rk)),
- (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_H v16i16:$xj,
- i64:$rk),
- sub_32)),
- GPR), (i64 16))>;
-def : Pat<(i64 (vector_extract v8i32:$xj, i64:$rk)),
- (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_W v8i32:$xj, i64:$rk),
- sub_32)),
- GPR)>;
-def : Pat<(i64 (vector_extract v4i64:$xj, i64:$rk)),
- (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (XVREPLVE_D v4i64:$xj, i64:$rk),
- sub_64)),
- GPR)>;
-def : Pat<(f32 (vector_extract v8f32:$xj, i64:$rk)),
- (f32 (EXTRACT_SUBREG (XVREPLVE_W v8f32:$xj, i64:$rk), sub_32))>;
-def : Pat<(f64 (vector_extract v4f64:$xj, i64:$rk)),
- (f64 (EXTRACT_SUBREG (XVREPLVE_D v4f64:$xj, i64:$rk), sub_64))>;
+def : Pat<(i64 (vector_extract v8i32:$xj, uimm3:$imm)),
+ (XVPICKVE2GR_W v8i32:$xj, uimm3:$imm)>;
+def : Pat<(i64 (vector_extract v4i64:$xj, uimm2:$imm)),
+ (XVPICKVE2GR_D v4i64:$xj, uimm2:$imm)>;
+def : Pat<(f32 (vector_extract v8f32:$xj, uimm3:$imm)),
+ (MOVGR2FR_W (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm))>;
+def : Pat<(f64 (vector_extract v4f64:$xj, uimm2:$imm)),
+ (MOVGR2FR_D (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm))>;
// vselect
-def : Pat<(v32i8 (vselect LASX256:$xj, LASX256:$xd,
- (v32i8 (SplatPat_uimm8 uimm8:$imm)))),
+def : Pat<(v32i8 (vselect LASX256:$xd, (v32i8 (SplatPat_uimm8 uimm8:$imm)),
+ LASX256:$xj)),
(XVBITSELI_B LASX256:$xd, LASX256:$xj, uimm8:$imm)>;
foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in
def : Pat<(vt (vselect LASX256:$xa, LASX256:$xk, LASX256:$xj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index e468176885d7..5569c2cd15b5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1731,8 +1731,8 @@ def : Pat<(f64 (vector_extract v2f64:$vj, i64:$rk)),
(f64 (EXTRACT_SUBREG (VREPLVE_D v2f64:$vj, i64:$rk), sub_64))>;
// vselect
-def : Pat<(v16i8 (vselect LSX128:$vj, LSX128:$vd,
- (v16i8 (SplatPat_uimm8 uimm8:$imm)))),
+def : Pat<(v16i8 (vselect LSX128:$vd, (v16i8 (SplatPat_uimm8 uimm8:$imm)),
+ LSX128:$vj)),
(VBITSELI_B LSX128:$vd, LSX128:$vj, uimm8:$imm)>;
foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
def : Pat<(vt (vselect LSX128:$va, LSX128:$vk, LSX128:$vj)),
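
Both the LASX and LSX vselect-to-[X]VBITSELI_B patterns are corrected so that the first vselect operand is the mask, the splatted immediate is taken where the mask is set, and the other register is taken elsewhere. A tiny scalar model of what the corrected patterns express (illustrative only, one byte at a time):

    #include <cstdint>

    // With a mask byte that is all-ones or all-zero (a vselect boolean lane),
    // a bitwise select yields Imm or Src respectively, mirroring
    // vselect(mask, splat(imm), src) -> [X]VBITSELI.B.
    static uint8_t bitSelectImm(uint8_t Mask, uint8_t Src, uint8_t Imm) {
      return static_cast<uint8_t>((Imm & Mask) | (Src & ~Mask));
    }
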
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index 45169becca37..d2ea062dc09a 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/EndianStream.h"
@@ -120,12 +121,15 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
assert(MO.isExpr() && "getExprOpValue expects only expressions");
+ bool RelaxCandidate = false;
+ bool EnableRelax = STI.hasFeature(LoongArch::FeatureRelax);
const MCExpr *Expr = MO.getExpr();
MCExpr::ExprKind Kind = Expr->getKind();
LoongArch::Fixups FixupKind = LoongArch::fixup_loongarch_invalid;
if (Kind == MCExpr::Target) {
const LoongArchMCExpr *LAExpr = cast<LoongArchMCExpr>(Expr);
+ RelaxCandidate = LAExpr->getRelaxHint();
switch (LAExpr->getKind()) {
case LoongArchMCExpr::VK_LoongArch_None:
case LoongArchMCExpr::VK_LoongArch_Invalid:
@@ -270,6 +274,15 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
Fixups.push_back(
MCFixup::create(0, Expr, MCFixupKind(FixupKind), MI.getLoc()));
+
+ // Emit an R_LARCH_RELAX if linker relaxation is enabled and LAExpr has relax
+ // hint.
+ if (EnableRelax && RelaxCandidate) {
+ const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx);
+ Fixups.push_back(MCFixup::create(
+ 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc()));
+ }
+
return 0;
}
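
The emitter change pairs the regular expression fixup with a zero-value relax fixup, gated on FeatureRelax and on the per-expression hint introduced above. A small LLVM-free model of that pairing (the types here are stand-ins, not LLVM's):

    #include <vector>

    struct Fixup { unsigned Kind; };

    // Push the primary fixup, then a companion marker (kind 1 stands in for
    // the R_LARCH_RELAX fixup in this sketch) only when the subtarget enables
    // linker relaxation and the expression carries the relax hint.
    static void emitFixups(std::vector<Fixup> &Fixups, unsigned RelocKind,
                           bool SubtargetHasRelax, bool ExprRelaxHint) {
      Fixups.push_back({RelocKind});
      if (SubtargetHasRelax && ExprRelaxHint)
        Fixups.push_back({1u});
    }
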
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 993111552a31..82c992b1cc8c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -25,9 +25,10 @@ using namespace llvm;
#define DEBUG_TYPE "loongarch-mcexpr"
-const LoongArchMCExpr *
-LoongArchMCExpr::create(const MCExpr *Expr, VariantKind Kind, MCContext &Ctx) {
- return new (Ctx) LoongArchMCExpr(Expr, Kind);
+const LoongArchMCExpr *LoongArchMCExpr::create(const MCExpr *Expr,
+ VariantKind Kind, MCContext &Ctx,
+ bool Hint) {
+ return new (Ctx) LoongArchMCExpr(Expr, Kind, Hint);
}
void LoongArchMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 0945cf82db86..93251f824103 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -67,16 +67,18 @@ public:
private:
const MCExpr *Expr;
const VariantKind Kind;
+ const bool RelaxHint;
- explicit LoongArchMCExpr(const MCExpr *Expr, VariantKind Kind)
- : Expr(Expr), Kind(Kind) {}
+ explicit LoongArchMCExpr(const MCExpr *Expr, VariantKind Kind, bool Hint)
+ : Expr(Expr), Kind(Kind), RelaxHint(Hint) {}
public:
static const LoongArchMCExpr *create(const MCExpr *Expr, VariantKind Kind,
- MCContext &Ctx);
+ MCContext &Ctx, bool Hint = false);
VariantKind getKind() const { return Kind; }
const MCExpr *getSubExpr() const { return Expr; }
+ bool getRelaxHint() const { return RelaxHint; }
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
diff --git a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
index 7bd382107773..7fcc65beaa65 100644
--- a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
+++ b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
@@ -161,6 +161,16 @@ bool M68kExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV16rf), MVT::i32,
MVT::i16);
+ case M68k::MOVSXd16q8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8dq), MVT::i16,
+ MVT::i8);
+ case M68k::MOVSXd32q8:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV8dq), MVT::i32,
+ MVT::i8);
+ case M68k::MOVSXd32q16:
+ return TII->ExpandMOVSZX_RM(MIB, true, TII->get(M68k::MOV16dq), MVT::i32,
+ MVT::i16);
+
case M68k::MOVZXd16q8:
return TII->ExpandMOVSZX_RM(MIB, false, TII->get(M68k::MOV8dq), MVT::i16,
MVT::i8);
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 0830cc7feb22..c4d7a0dec7f3 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -94,11 +94,10 @@ M68kTargetLowering::M68kTargetLowering(const M68kTargetMachine &TM,
setOperationAction(OP, MVT::i16, Expand);
}
- // FIXME It would be better to use a custom lowering
for (auto OP : {ISD::SMULO, ISD::UMULO}) {
- setOperationAction(OP, MVT::i8, Expand);
- setOperationAction(OP, MVT::i16, Expand);
- setOperationAction(OP, MVT::i32, Expand);
+ setOperationAction(OP, MVT::i8, Custom);
+ setOperationAction(OP, MVT::i16, Custom);
+ setOperationAction(OP, MVT::i32, Custom);
}
for (auto OP : {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS})
@@ -1533,46 +1532,119 @@ bool M68kTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
return VT.bitsLE(MVT::i32) || Subtarget.atLeastM68020();
}
-SDValue M68kTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
- // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
- // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
- // looks for this combo and may remove the "setcc" instruction if the "setcc"
- // has only one use.
+static bool isOverflowArithmetic(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::UMULO:
+ case ISD::SMULO:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void lowerOverflowArithmetic(SDValue Op, SelectionDAG &DAG,
+ SDValue &Result, SDValue &CCR,
+ unsigned &CC) {
SDNode *N = Op.getNode();
+ EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- unsigned BaseOp = 0;
- unsigned Cond = 0;
SDLoc DL(Op);
+
+ unsigned TruncOp = 0;
+ auto PromoteMULO = [&](unsigned ExtOp) {
+ // We don't have 8-bit multiplications, so promote i8 version of U/SMULO
+ // to i16.
+ // Ideally this should be done by legalizer but sadly there is no promotion
+ // rule for U/SMULO at this moment.
+ if (VT == MVT::i8) {
+ LHS = DAG.getNode(ExtOp, DL, MVT::i16, LHS);
+ RHS = DAG.getNode(ExtOp, DL, MVT::i16, RHS);
+ VT = MVT::i16;
+ TruncOp = ISD::TRUNCATE;
+ }
+ };
+
+ bool NoOverflow = false;
+ unsigned BaseOp = 0;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
BaseOp = M68kISD::ADD;
- Cond = M68k::COND_VS;
+ CC = M68k::COND_VS;
break;
case ISD::UADDO:
BaseOp = M68kISD::ADD;
- Cond = M68k::COND_CS;
+ CC = M68k::COND_CS;
break;
case ISD::SSUBO:
BaseOp = M68kISD::SUB;
- Cond = M68k::COND_VS;
+ CC = M68k::COND_VS;
break;
case ISD::USUBO:
BaseOp = M68kISD::SUB;
- Cond = M68k::COND_CS;
+ CC = M68k::COND_CS;
+ break;
+ case ISD::UMULO:
+ PromoteMULO(ISD::ZERO_EXTEND);
+ NoOverflow = VT != MVT::i32;
+ BaseOp = NoOverflow ? ISD::MUL : M68kISD::UMUL;
+ CC = M68k::COND_VS;
+ break;
+ case ISD::SMULO:
+ PromoteMULO(ISD::SIGN_EXTEND);
+ NoOverflow = VT != MVT::i32;
+ BaseOp = NoOverflow ? ISD::MUL : M68kISD::SMUL;
+ CC = M68k::COND_VS;
break;
}
- // Also sets CCR.
- SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i8);
+ SDVTList VTs;
+ if (NoOverflow)
+ VTs = DAG.getVTList(VT);
+ else
+ // Also sets CCR.
+ VTs = DAG.getVTList(VT, MVT::i8);
+
SDValue Arith = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
- SDValue SetCC = DAG.getNode(M68kISD::SETCC, DL, N->getValueType(1),
- DAG.getConstant(Cond, DL, MVT::i8),
- SDValue(Arith.getNode(), 1));
+ Result = Arith.getValue(0);
+ if (TruncOp)
+ // Right now the only place to truncate is from i16 to i8.
+ Result = DAG.getNode(TruncOp, DL, MVT::i8, Arith);
- return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, SetCC);
+ if (NoOverflow)
+ CCR = DAG.getConstant(0, DL, N->getValueType(1));
+ else
+ CCR = Arith.getValue(1);
+}
+
+SDValue M68kTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *N = Op.getNode();
+ SDLoc DL(Op);
+
+ // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+ // a "setcc" instruction that checks the overflow flag.
+ SDValue Result, CCR;
+ unsigned CC;
+ lowerOverflowArithmetic(Op, DAG, Result, CCR, CC);
+
+ SDValue Overflow;
+ if (isa<ConstantSDNode>(CCR)) {
+ // It's likely a result of operations that will not overflow
+ // hence no setcc is needed.
+ Overflow = CCR;
+ } else {
+ // Generate a M68kISD::SETCC.
+ Overflow = DAG.getNode(M68kISD::SETCC, DL, N->getValueType(1),
+ DAG.getConstant(CC, DL, MVT::i8), CCR);
+ }
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Overflow);
}
/// Create a BTST (Bit Test) node - Test bit \p BitNo in \p Src and set
@@ -2206,8 +2278,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
isNullConstant(Cond.getOperand(1).getOperand(0))) {
SDValue Cmp = Cond.getOperand(1);
- unsigned CondCode =
- cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ unsigned CondCode = Cond.getConstantOperandVal(0);
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == M68k::COND_EQ || CondCode == M68k::COND_NE)) {
@@ -2269,55 +2340,12 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond = Cmp;
addTest = false;
}
- } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
- CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
- CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
- SDValue LHS = Cond.getOperand(0);
- SDValue RHS = Cond.getOperand(1);
- unsigned MxOpcode;
- unsigned MxCond;
- SDVTList VTs;
- switch (CondOpcode) {
- case ISD::UADDO:
- MxOpcode = M68kISD::ADD;
- MxCond = M68k::COND_CS;
- break;
- case ISD::SADDO:
- MxOpcode = M68kISD::ADD;
- MxCond = M68k::COND_VS;
- break;
- case ISD::USUBO:
- MxOpcode = M68kISD::SUB;
- MxCond = M68k::COND_CS;
- break;
- case ISD::SSUBO:
- MxOpcode = M68kISD::SUB;
- MxCond = M68k::COND_VS;
- break;
- case ISD::UMULO:
- MxOpcode = M68kISD::UMUL;
- MxCond = M68k::COND_VS;
- break;
- case ISD::SMULO:
- MxOpcode = M68kISD::SMUL;
- MxCond = M68k::COND_VS;
- break;
- default:
- llvm_unreachable("unexpected overflowing operator");
- }
- if (CondOpcode == ISD::UMULO)
- VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32);
- else
- VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
-
- SDValue MxOp = DAG.getNode(MxOpcode, DL, VTs, LHS, RHS);
-
- if (CondOpcode == ISD::UMULO)
- Cond = MxOp.getValue(2);
- else
- Cond = MxOp.getValue(1);
-
- CC = DAG.getConstant(MxCond, DL, MVT::i8);
+ } else if (isOverflowArithmetic(CondOpcode)) {
+ // Result is unused here.
+ SDValue Result;
+ unsigned CCode;
+ lowerOverflowArithmetic(Cond, DAG, Result, Cond, CCode);
+ CC = DAG.getConstant(CCode, DL, MVT::i8);
addTest = false;
}
@@ -2377,6 +2405,17 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // Simple optimization when Cond is a constant to avoid generating
+ // M68kISD::CMOV if possible.
+ // TODO: Generalize this to use SelectionDAG::computeKnownBits.
+ if (auto *Const = dyn_cast<ConstantSDNode>(Cond.getNode())) {
+ const APInt &C = Const->getAPIntValue();
+ if (C.countr_zero() >= 5)
+ return Op2;
+ else if (C.countr_one() >= 5)
+ return Op1;
+ }
+
// M68kISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
@@ -2466,61 +2505,15 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
}
}
CondOpcode = Cond.getOpcode();
- if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
- CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO) {
- SDValue LHS = Cond.getOperand(0);
- SDValue RHS = Cond.getOperand(1);
- unsigned MxOpcode;
- unsigned MxCond;
- SDVTList VTs;
- // Keep this in sync with LowerXALUO, otherwise we might create redundant
- // instructions that can't be removed afterwards (i.e. M68kISD::ADD and
- // M68kISD::INC).
- switch (CondOpcode) {
- case ISD::UADDO:
- MxOpcode = M68kISD::ADD;
- MxCond = M68k::COND_CS;
- break;
- case ISD::SADDO:
- MxOpcode = M68kISD::ADD;
- MxCond = M68k::COND_VS;
- break;
- case ISD::USUBO:
- MxOpcode = M68kISD::SUB;
- MxCond = M68k::COND_CS;
- break;
- case ISD::SSUBO:
- MxOpcode = M68kISD::SUB;
- MxCond = M68k::COND_VS;
- break;
- case ISD::UMULO:
- MxOpcode = M68kISD::UMUL;
- MxCond = M68k::COND_VS;
- break;
- case ISD::SMULO:
- MxOpcode = M68kISD::SMUL;
- MxCond = M68k::COND_VS;
- break;
- default:
- llvm_unreachable("unexpected overflowing operator");
- }
+ if (isOverflowArithmetic(CondOpcode)) {
+ SDValue Result;
+ unsigned CCode;
+ lowerOverflowArithmetic(Cond, DAG, Result, Cond, CCode);
if (Inverted)
- MxCond = M68k::GetOppositeBranchCondition((M68k::CondCode)MxCond);
+ CCode = M68k::GetOppositeBranchCondition((M68k::CondCode)CCode);
+ CC = DAG.getConstant(CCode, DL, MVT::i8);
- if (CondOpcode == ISD::UMULO)
- VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i8);
- else
- VTs = DAG.getVTList(LHS.getValueType(), MVT::i8);
-
- SDValue MxOp = DAG.getNode(MxOpcode, DL, VTs, LHS, RHS);
-
- if (CondOpcode == ISD::UMULO)
- Cond = MxOp.getValue(2);
- else
- Cond = MxOp.getValue(1);
-
- CC = DAG.getConstant(MxCond, DL, MVT::i8);
AddTest = false;
} else {
unsigned CondOpc;
@@ -3394,7 +3387,7 @@ SDValue M68kTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Align = Op.getConstantOperandVal(2);
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
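
Earlier in this file, the PromoteMULO lambda widens i8 U/SMULO operands to i16 because the target has no 8-bit multiply, then truncates the product back. An LLVM-free illustration of just that widening (overflow-flag handling is intentionally omitted here):

    #include <cstdint>

    // Unsigned: ZERO_EXTEND both operands, multiply in 16 bits, TRUNCATE back.
    static uint8_t mul8ViaMul16U(uint8_t A, uint8_t B) {
      uint16_t Wide = static_cast<uint16_t>(A) * static_cast<uint16_t>(B);
      return static_cast<uint8_t>(Wide);
    }

    // Signed: SIGN_EXTEND both operands, multiply in 16 bits, TRUNCATE back.
    static int8_t mul8ViaMul16S(int8_t A, int8_t B) {
      int16_t Wide = static_cast<int16_t>(A) * static_cast<int16_t>(B);
      return static_cast<int8_t>(Wide);
    }
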
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.h b/llvm/lib/Target/M68k/M68kISelLowering.h
index 02427a4e749e..d00907775f92 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.h
+++ b/llvm/lib/Target/M68k/M68kISelLowering.h
@@ -194,6 +194,15 @@ private:
unsigned GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG &DAG) const;
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override {
+ // In many cases, `GA` doesn't give the correct offset to fold. It's
+ // hard to know if the real offset actually fits into the displacement
+ // of the perspective addressing mode.
+ // Thus, we disable offset folding altogether and leave that to ISel
+ // patterns.
+ return false;
+ }
+
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
/// Emit a load of return address if tail call
diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
index 15d2049f62cb..3532e56e7417 100644
--- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td
+++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
@@ -590,8 +590,9 @@ class MxDiMuOp_DD<string MN, bits<4> CMD, bit SIGNED = false,
}
// $dreg <- $dreg op $dreg
-class MxDiMuOp_DD_Long<string MN, bits<10> CMD, bit SIGNED = false>
- : MxInst<(outs MxDRD32:$dst), (ins MxDRD32:$src, MxDRD32:$opd), MN#"\t$opd, $dst", []> {
+class MxDiMuOp_DD_Long<string MN, SDNode NODE, bits<10> CMD, bit SIGNED = false>
+ : MxInst<(outs MxDRD32:$dst), (ins MxDRD32:$src, MxDRD32:$opd), MN#"\t$opd, $dst",
+ [(set i32:$dst, CCR, (NODE i32:$src, i32:$opd))]> {
let Inst = (ascend
(descend CMD,
/*MODE*/0b000, /*REGISTER*/(operand "$opd", 3)),
@@ -622,11 +623,9 @@ class MxDiMuOp_DI<string MN, bits<4> CMD, bit SIGNED = false,
} // let Constraints
} // Defs = [CCR]
-multiclass MxDiMuOp<string MN, bits<4> CMD, bit isComm = 0> {
- let isCommutable = isComm in {
- def "S"#NAME#"d32d16" : MxDiMuOp_DD<MN#"s", CMD, /*SIGNED*/true, MxDRD32, MxDRD16>;
- def "U"#NAME#"d32d16" : MxDiMuOp_DD<MN#"u", CMD, /*SIGNED*/false, MxDRD32, MxDRD16>;
- }
+multiclass MxDiMuOp<string MN, bits<4> CMD> {
+ def "S"#NAME#"d32d16" : MxDiMuOp_DD<MN#"s", CMD, /*SIGNED*/true, MxDRD32, MxDRD16>;
+ def "U"#NAME#"d32d16" : MxDiMuOp_DD<MN#"u", CMD, /*SIGNED*/false, MxDRD32, MxDRD16>;
def "S"#NAME#"d32i16" : MxDiMuOp_DI<MN#"s", CMD, /*SIGNED*/true, MxDRD32, Mxi16imm>;
def "U"#NAME#"d32i16" : MxDiMuOp_DI<MN#"u", CMD, /*SIGNED*/false, MxDRD32, Mxi16imm>;
@@ -634,8 +633,8 @@ multiclass MxDiMuOp<string MN, bits<4> CMD, bit isComm = 0> {
defm DIV : MxDiMuOp<"div", 0x8>;
-def SDIVd32d32 : MxDiMuOp_DD_Long<"divs.l", 0x131, /*SIGNED*/true>;
-def UDIVd32d32 : MxDiMuOp_DD_Long<"divu.l", 0x131, /*SIGNED*/false>;
+def SDIVd32d32 : MxDiMuOp_DD_Long<"divs.l", sdiv, 0x131, /*SIGNED*/true>;
+def UDIVd32d32 : MxDiMuOp_DD_Long<"divu.l", udiv, 0x131, /*SIGNED*/false>;
// This is used to cast immediates to 16-bits for operations which don't
// support smaller immediate sizes.
@@ -685,60 +684,53 @@ def : Pat<(urem i16:$dst, i16:$opd),
(LSR32di (LSR32di (UDIVd32d16 (MOVZXd32d16 $dst), $opd), 8), 8),
MxSubRegIndex16Lo)>;
-
-// RR i32
-def : Pat<(sdiv i32:$dst, i32:$opd), (SDIVd32d32 $dst, $opd)>;
-
-def : Pat<(udiv i32:$dst, i32:$opd), (UDIVd32d32 $dst, $opd)>;
-
-
// RI i8
-def : Pat<(sdiv i8:$dst, MximmSExt8:$opd),
+def : Pat<(sdiv i8:$dst, Mxi8immSExt8:$opd),
(EXTRACT_SUBREG
(SDIVd32i16 (MOVSXd32d8 $dst), (as_i16imm $opd)),
MxSubRegIndex8Lo)>;
-def : Pat<(udiv i8:$dst, MximmSExt8:$opd),
+def : Pat<(udiv i8:$dst, Mxi8immSExt8:$opd),
(EXTRACT_SUBREG
(UDIVd32i16 (MOVZXd32d8 $dst), (as_i16imm $opd)),
MxSubRegIndex8Lo)>;
-def : Pat<(srem i8:$dst, MximmSExt8:$opd),
+def : Pat<(srem i8:$dst, Mxi8immSExt8:$opd),
(EXTRACT_SUBREG
(ASR32di (ASR32di (SDIVd32i16 (MOVSXd32d8 $dst), (as_i16imm $opd)), 8), 8),
MxSubRegIndex8Lo)>;
-def : Pat<(urem i8:$dst, MximmSExt8:$opd),
+def : Pat<(urem i8:$dst, Mxi8immSExt8:$opd),
(EXTRACT_SUBREG
(LSR32di (LSR32di (UDIVd32i16 (MOVZXd32d8 $dst), (as_i16imm $opd)), 8), 8),
MxSubRegIndex8Lo)>;
// RI i16
-def : Pat<(sdiv i16:$dst, MximmSExt16:$opd),
+def : Pat<(sdiv i16:$dst, Mxi16immSExt16:$opd),
(EXTRACT_SUBREG
(SDIVd32i16 (MOVSXd32d16 $dst), imm:$opd),
MxSubRegIndex16Lo)>;
-def : Pat<(udiv i16:$dst, MximmSExt16:$opd),
+def : Pat<(udiv i16:$dst, Mxi16immSExt16:$opd),
(EXTRACT_SUBREG
(UDIVd32i16 (MOVZXd32d16 $dst), imm:$opd),
MxSubRegIndex16Lo)>;
-def : Pat<(srem i16:$dst, MximmSExt16:$opd),
+def : Pat<(srem i16:$dst, Mxi16immSExt16:$opd),
(EXTRACT_SUBREG
(ASR32di (ASR32di (SDIVd32i16 (MOVSXd32d16 $dst), imm:$opd), 8), 8),
MxSubRegIndex16Lo)>;
-def : Pat<(urem i16:$dst, MximmSExt16:$opd),
+def : Pat<(urem i16:$dst, Mxi16immSExt16:$opd),
(EXTRACT_SUBREG
(LSR32di (LSR32di (UDIVd32i16 (MOVZXd32d16 $dst), imm:$opd), 8), 8),
MxSubRegIndex16Lo)>;
-defm MUL : MxDiMuOp<"mul", 0xC, 1>;
+defm MUL : MxDiMuOp<"mul", 0xC>;
-def SMULd32d32 : MxDiMuOp_DD_Long<"muls.l", 0x130, /*SIGNED*/true>;
-def UMULd32d32 : MxDiMuOp_DD_Long<"mulu.l", 0x130, /*SIGNED*/false>;
+def SMULd32d32 : MxDiMuOp_DD_Long<"muls.l", MxSMul, 0x130, /*SIGNED*/true>;
+def UMULd32d32 : MxDiMuOp_DD_Long<"mulu.l", MxUMul, 0x130, /*SIGNED*/false>;
// RR
def : Pat<(mul i16:$dst, i16:$opd),
@@ -760,17 +752,17 @@ def : Pat<(mul i32:$dst, i32:$opd), (SMULd32d32 $dst, $opd)>;
// RI
-def : Pat<(mul i16:$dst, MximmSExt16:$opd),
+def : Pat<(mul i16:$dst, Mxi16immSExt16:$opd),
(EXTRACT_SUBREG
(SMULd32i16 (MOVXd32d16 $dst), imm:$opd),
MxSubRegIndex16Lo)>;
-def : Pat<(mulhs i16:$dst, MximmSExt16:$opd),
+def : Pat<(mulhs i16:$dst, Mxi16immSExt16:$opd),
(EXTRACT_SUBREG
(ASR32di (ASR32di (SMULd32i16 (MOVXd32d16 $dst), imm:$opd), 8), 8),
MxSubRegIndex16Lo)>;
-def : Pat<(mulhu i16:$dst, MximmSExt16:$opd),
+def : Pat<(mulhu i16:$dst, Mxi16immSExt16:$opd),
(EXTRACT_SUBREG
(LSR32di (LSR32di (UMULd32i16 (MOVXd32d16 $dst), imm:$opd), 8), 8),
MxSubRegIndex16Lo)>;
@@ -881,16 +873,16 @@ foreach N = ["add", "addc"] in {
(ADD32df MxDRD32:$src, MxType32.FOp:$opd)>;
// add reg, imm
- def : Pat<(!cast<SDNode>(N) i8: $src, MximmSExt8:$opd),
+ def : Pat<(!cast<SDNode>(N) i8: $src, Mxi8immSExt8:$opd),
(ADD8di MxDRD8 :$src, imm:$opd)>;
- def : Pat<(!cast<SDNode>(N) i16:$src, MximmSExt16:$opd),
+ def : Pat<(!cast<SDNode>(N) i16:$src, Mxi16immSExt16:$opd),
(ADD16di MxDRD16:$src, imm:$opd)>;
// LEAp is more complex and thus will be selected over normal ADD32ri but it cannot
// be used with data registers, here by adding complexity to a simple ADD32ri insts
// we make sure it will be selected over LEAp
let AddedComplexity = 15 in {
- def : Pat<(!cast<SDNode>(N) i32:$src, MximmSExt32:$opd),
+ def : Pat<(!cast<SDNode>(N) i32:$src, Mxi32immSExt32:$opd),
(ADD32di MxDRD32:$src, imm:$opd)>;
} // AddedComplexity = 15
@@ -949,11 +941,11 @@ foreach N = ["sub", "subc"] in {
(SUB32df MxDRD32:$src, MxType32.FOp:$opd)>;
// sub reg, imm
- def : Pat<(!cast<SDNode>(N) i8 :$src, MximmSExt8 :$opd),
+ def : Pat<(!cast<SDNode>(N) i8 :$src, Mxi8immSExt8 :$opd),
(SUB8di MxDRD8 :$src, imm:$opd)>;
- def : Pat<(!cast<SDNode>(N) i16:$src, MximmSExt16:$opd),
+ def : Pat<(!cast<SDNode>(N) i16:$src, Mxi16immSExt16:$opd),
(SUB16di MxDRD16:$src, imm:$opd)>;
- def : Pat<(!cast<SDNode>(N) i32:$src, MximmSExt32:$opd),
+ def : Pat<(!cast<SDNode>(N) i32:$src, Mxi32immSExt32:$opd),
(SUB32di MxDRD32:$src, imm:$opd)>;
// sub imm, (An)
@@ -982,11 +974,11 @@ multiclass BitwisePat<string INST, SDNode OP> {
def : Pat<(OP i32:$src, i32:$opd),
(!cast<MxInst>(INST#"32dd") MxDRD32:$src, MxDRD32:$opd)>;
// op reg, imm
- def : Pat<(OP i8: $src, MximmSExt8 :$opd),
+ def : Pat<(OP i8: $src, Mxi8immSExt8 :$opd),
(!cast<MxInst>(INST#"8di") MxDRD8 :$src, imm:$opd)>;
- def : Pat<(OP i16:$src, MximmSExt16:$opd),
+ def : Pat<(OP i16:$src, Mxi16immSExt16:$opd),
(!cast<MxInst>(INST#"16di") MxDRD16:$src, imm:$opd)>;
- def : Pat<(OP i32:$src, MximmSExt32:$opd),
+ def : Pat<(OP i32:$src, Mxi32immSExt32:$opd),
(!cast<MxInst>(INST#"32di") MxDRD32:$src, imm:$opd)>;
}
diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td
index 624093661d19..fa7e7aa0ed46 100644
--- a/llvm/lib/Target/M68k/M68kInstrData.td
+++ b/llvm/lib/Target/M68k/M68kInstrData.td
@@ -554,18 +554,21 @@ def: Pat<(MxSExtLoadi16i8 MxCP_ARID:$src),
(EXTRACT_SUBREG (MOVSXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>;
def: Pat<(MxSExtLoadi16i8 MxCP_ARII:$src),
(EXTRACT_SUBREG (MOVSXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>;
+def: Pat<(MxSExtLoadi16i8 MxCP_PCD:$src), (MOVSXd16q8 MxPCD8:$src)>;
// i32 <- sext i8
def: Pat<(i32 (sext i8:$src)), (MOVSXd32d8 MxDRD8:$src)>;
def: Pat<(MxSExtLoadi32i8 MxCP_ARI :$src), (MOVSXd32j8 MxARI8 :$src)>;
def: Pat<(MxSExtLoadi32i8 MxCP_ARID:$src), (MOVSXd32p8 MxARID8:$src)>;
def: Pat<(MxSExtLoadi32i8 MxCP_ARII:$src), (MOVSXd32f8 MxARII8:$src)>;
+def: Pat<(MxSExtLoadi32i8 MxCP_PCD:$src), (MOVSXd32q8 MxPCD8:$src)>;
// i32 <- sext i16
def: Pat<(i32 (sext i16:$src)), (MOVSXd32d16 MxDRD16:$src)>;
def: Pat<(MxSExtLoadi32i16 MxCP_ARI :$src), (MOVSXd32j16 MxARI16 :$src)>;
def: Pat<(MxSExtLoadi32i16 MxCP_ARID:$src), (MOVSXd32p16 MxARID16:$src)>;
def: Pat<(MxSExtLoadi32i16 MxCP_ARII:$src), (MOVSXd32f16 MxARII16:$src)>;
+def: Pat<(MxSExtLoadi32i16 MxCP_PCD:$src), (MOVSXd32q16 MxPCD16:$src)>;
// i16 <- zext i8
def: Pat<(i16 (zext i8:$src)),
diff --git a/llvm/lib/Target/M68k/M68kInstrFormats.td b/llvm/lib/Target/M68k/M68kInstrFormats.td
index 38d3127ac6a6..99bac7a59939 100644
--- a/llvm/lib/Target/M68k/M68kInstrFormats.td
+++ b/llvm/lib/Target/M68k/M68kInstrFormats.td
@@ -17,22 +17,22 @@
/// 03 M68000 (An) j address register indirect
/// 04 M68000 (An)+ o address register indirect with postincrement
/// 05 M68000 -(An) e address register indirect with predecrement
-/// 06 M68000 (i,An) p address register indirect with displacement
-/// 10 M68000 (i,An,Xn.L) f address register indirect with index and scale = 1
-/// 07 M68000 (i,An,Xn.W) F address register indirect with index and scale = 1
-/// 12 M68020 (i,An,Xn.L,SCALE) g address register indirect with index
-/// 11 M68020 (i,An,Xn.W,SCALE) G address register indirect with index
+/// 06 M68000 (d16,An) p address register indirect with displacement
+/// 10 M68000 (d8,An,Xn.L) f address register indirect with index and scale = 1
+/// 07 M68000 (d8,An,Xn.W) F address register indirect with index and scale = 1
+/// 12 M68020 (d8,An,Xn.L,SCALE) g address register indirect with index
+/// 11 M68020 (d8,An,Xn.W,SCALE) G address register indirect with index
/// 14 M68020 ([bd,An],Xn.L,SCALE,od) u memory indirect postindexed mode
/// 13 M68020 ([bd,An],Xn.W,SCALE,od) U memory indirect postindexed mode
/// 16 M68020 ([bd,An,Xn.L,SCALE],od) v memory indirect preindexed mode
/// 15 M68020 ([bd,An,Xn.W,SCALE],od) V memory indirect preindexed mode
/// 20 M68000 abs.L b absolute long address
/// 17 M68000 abs.W B absolute short address
-/// 21 M68000 (i,PC) q program counter with displacement
-/// 23 M68000 (i,PC,Xn.L) k program counter with index and scale = 1
-/// 22 M68000 (i,PC,Xn.W) K program counter with index and scale = 1
-/// 25 M68020 (i,PC,Xn.L,SCALE) l program counter with index
-/// 24 M68020 (i,PC,Xn.W,SCALE) L program counter with index
+/// 21 M68000 (d16,PC) q program counter with displacement
+/// 23 M68000 (d8,PC,Xn.L) k program counter with index and scale = 1
+/// 22 M68000 (d8,PC,Xn.W) K program counter with index and scale = 1
+/// 25 M68020 (d8,PC,Xn.L,SCALE) l program counter with index
+/// 24 M68020 (d8,PC,Xn.W,SCALE) L program counter with index
/// 27 M68020 ([bd,PC],Xn.L,SCALE,od) x program counter memory indirect postindexed mode
/// 26 M68020 ([bd,PC],Xn.W,SCALE,od) X program counter memory indirect postindexed mode
/// 31 M68020 ([bd,PC,Xn.L,SCALE],od) y program counter memory indirect preindexed mode
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.td b/llvm/lib/Target/M68k/M68kInstrInfo.td
index dc66e103361a..84eb8e56da76 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.td
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.td
@@ -55,15 +55,6 @@ def MxSDT_BiArithCCRInOut : SDTypeProfile<2, 3, [
/* CCR */ SDTCisSameAs<1, 4>
]>;
-// RES1, RES2, CCR <- op LHS, RHS
-def MxSDT_2BiArithCCROut : SDTypeProfile<3, 2, [
- /* RES 1 */ SDTCisInt<0>,
- /* RES 2 */ SDTCisSameAs<0, 1>,
- /* CCR */ SDTCisVT<1, i8>,
- /* LHS */ SDTCisSameAs<0, 2>,
- /* RHS */ SDTCisSameAs<0, 3>
-]>;
-
def MxSDT_CmpTest : SDTypeProfile<1, 2, [
/* CCR */ SDTCisVT<0, i8>,
/* Ops */ SDTCisSameAs<1, 2>
@@ -134,7 +125,7 @@ def MxAddX : SDNode<"M68kISD::ADDX", MxSDT_BiArithCCRInOut>;
def MxSubX : SDNode<"M68kISD::SUBX", MxSDT_BiArithCCRInOut>;
def MxSMul : SDNode<"M68kISD::SMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>;
-def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_2BiArithCCROut, [SDNPCommutative]>;
+def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>;
def MxCmp : SDNode<"M68kISD::CMP", MxSDT_CmpTest>;
def MxBtst : SDNode<"M68kISD::BTST", MxSDT_CmpTest>;
@@ -522,9 +513,14 @@ def MxCP_PCI : ComplexPattern<iPTR, 2, "SelectPCI",
// Pattern Fragments
//===----------------------------------------------------------------------===//
-def MximmSExt8 : PatLeaf<(i8 imm)>;
-def MximmSExt16 : PatLeaf<(i16 imm)>;
-def MximmSExt32 : PatLeaf<(i32 imm)>;
+def Mxi8immSExt8 : PatLeaf<(i8 imm)>;
+def MximmSExt8 : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+
+def Mxi16immSExt16 : PatLeaf<(i16 imm)>;
+def MximmSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+
+def Mxi32immSExt32 : PatLeaf<(i32 imm)>;
+def MximmSExt32 : PatLeaf<(imm), [{ return isInt<32>(N->getSExtValue()); }]>;
// Used for Shifts and Rotations, since M68k immediates in these instructions
// are 1 <= i <= 8. Generally, if immediate is bigger than 8 it will be moved
@@ -717,7 +713,7 @@ foreach size = [8, 16, 32] in {
// #imm
def MxOp#size#AddrMode_i
: MxImmOpBundle<size, !cast<MxOperand>("Mxi"#size#"imm"),
- !cast<PatFrag>("MximmSExt"#size)>;
+ !cast<PatFrag>("Mxi"#size#"immSExt"#size)>;
} // foreach size = [8, 16, 32]
foreach size = [16, 32] in {
@@ -747,7 +743,7 @@ class MxType8Class<string rLet, MxOperand reg>
MxAL8, MxCP_AL,
MxPCD8, MxCP_PCD,
MxPCI8, MxCP_PCI,
- Mxi8imm, MximmSExt8,
+ Mxi8imm, Mxi8immSExt8,
Mxloadi8>;
def MxType8 : MxType8Class<?,?>;
@@ -762,7 +758,7 @@ class MxType16Class<string rLet, MxOperand reg>
MxAL16, MxCP_AL,
MxPCD16, MxCP_PCD,
MxPCI16, MxCP_PCI,
- Mxi16imm, MximmSExt16,
+ Mxi16imm, Mxi16immSExt16,
Mxloadi16>;
def MxType16 : MxType16Class<?,?>;
@@ -777,7 +773,7 @@ class MxType32Class<string rLet, MxOperand reg>
MxAL32, MxCP_AL,
MxPCD32, MxCP_PCD,
MxPCI32, MxCP_PCI,
- Mxi32imm, MximmSExt32,
+ Mxi32imm, Mxi32immSExt32,
Mxloadi32>;
def MxType32 : MxType32Class<?,?>;
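
The M68k pattern changes above split each immediate leaf in two: the new Mxi8immSExt8/Mxi16immSExt16/Mxi32immSExt32 leaves keep the old behaviour of requiring an immediate node of the matching type, while the re-purposed MximmSExt8/16/32 leaves only require that the value fits in that many signed bits (isInt<N> on the sign-extended value). A minimal standalone sketch of that width-only test, assuming it mirrors llvm::isInt<N>; fitsInSignedN is a hypothetical stand-in, not code from the patch:

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for llvm::isInt<N>(): true iff V is representable
// as an N-bit two's-complement signed integer.
template <unsigned N> constexpr bool fitsInSignedN(int64_t V) {
  return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
}

int main() {
  // 100 passes the width-only check used by the loosened MximmSExt8 leaf;
  // 300 does not. The typed Mxi8immSExt8 leaf additionally requires the
  // immediate to be an i8 node, which the width check alone cannot tell.
  std::cout << fitsInSignedN<8>(100) << ' ' << fitsInSignedN<8>(300) << '\n';
  return 0;
}
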
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index ee7762c296bf..d3b59138a5a9 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -964,7 +964,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
if (!isa<ConstantSDNode>(N->getOperand(1)))
return Op;
- uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ uint64_t ShiftAmount = N->getConstantOperandVal(1);
// Expand the stuff into sequence of shifts.
SDValue Victim = N->getOperand(0);
@@ -1269,7 +1269,7 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
SDLoc dl(Op);
EVT PtrVT = Op.getValueType();
@@ -1295,7 +1295,7 @@ SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc dl(Op); // FIXME probably not meaningful
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
MSP430::R4, VT);
while (Depth--)
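
The MSP430 hunks above, and many later hunks in the Mips, NVPTX, PowerPC and RISC-V files, are the same mechanical cleanup: long-hand cast<ConstantSDNode>(...)->getZExtValue() chains are replaced with the getConstantOperandVal helper (and getConstantOperandAPInt where an APInt is wanted). A sketch of the equivalence the cleanup relies on, written as free functions rather than the upstream member definitions:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// What the rewritten call sites stand for, spelled out long-hand; SDValue
// exposes the same two helpers and forwards them to its node.
static uint64_t constantOperandVal(const SDNode *N, unsigned Num) {
  return cast<ConstantSDNode>(N->getOperand(Num))->getZExtValue();
}

static const APInt &constantOperandAPInt(const SDNode *N, unsigned Num) {
  return cast<ConstantSDNode>(N->getOperand(Num))->getAPIntValue();
}
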
diff --git a/llvm/lib/Target/Mips/Mips64InstrInfo.td b/llvm/lib/Target/Mips/Mips64InstrInfo.td
index ac679c4c01bc..c0e7eef8dd9d 100644
--- a/llvm/lib/Target/Mips/Mips64InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -164,20 +164,20 @@ def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>, GPR_64;
/// Shift Instructions
let AdditionalPredicates = [NotInMicroMips] in {
- def DSLL : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL, shl,
+ def DSLL : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL, mshl_64,
immZExt6>,
SRA_FM<0x38, 0>, ISA_MIPS3;
- def DSRL : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL, srl,
+ def DSRL : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL, msrl_64,
immZExt6>,
SRA_FM<0x3a, 0>, ISA_MIPS3;
- def DSRA : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA, sra,
+ def DSRA : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA, msra_64,
immZExt6>,
SRA_FM<0x3b, 0>, ISA_MIPS3;
- def DSLLV : shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
+ def DSLLV : shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, mshl_64>,
SRLV_FM<0x14, 0>, ISA_MIPS3;
- def DSRAV : shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
+ def DSRAV : shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, msra_64>,
SRLV_FM<0x17, 0>, ISA_MIPS3;
- def DSRLV : shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
+ def DSRLV : shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, msrl_64>,
SRLV_FM<0x16, 0>, ISA_MIPS3;
def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd, II_DSLL32>,
SRA_FM<0x3c, 0>, ISA_MIPS3;
diff --git a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 77ce8ba890a8..01b41f3b2159 100644
--- a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/CFG.h"
@@ -31,6 +32,7 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -324,6 +326,24 @@ bool MipsDAGToDAGISel::SelectInlineAsmMemoryOperand(
return true;
}
+bool MipsDAGToDAGISel::isUnneededShiftMask(SDNode *N,
+ unsigned ShAmtBits) const {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
+
+ const APInt &RHS = N->getConstantOperandAPInt(1);
+ if (RHS.countr_one() >= ShAmtBits) {
+ LLVM_DEBUG(
+ dbgs()
+ << DEBUG_TYPE
+ << " Need optimize 'and & shl/srl/sra' and operand value bits is "
+ << RHS.countr_one() << "\n");
+ return true;
+ }
+
+ KnownBits Known = CurDAG->computeKnownBits(N->getOperand(0));
+ return (Known.Zero | RHS).countr_one() >= ShAmtBits;
+}
+
char MipsDAGToDAGISel::ID = 0;
INITIALIZE_PASS(MipsDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
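
The new MipsDAGToDAGISel::isUnneededShiftMask hook above decides whether an explicit 'and' in front of a shift amount may be ignored: the mask is redundant if it keeps at least ShAmtBits low bits, either literally or once the amount's known-zero bits are taken into account. A standalone model of that test over plain 64-bit values (the real code works on APInt and KnownBits; this is only a sketch and needs C++20 <bit>):

#include <bit>
#include <cstdint>

// ShAmtBits is 5 for 32-bit shifts and 6 for 64-bit shifts, matching the
// two PatFrag predicates added in MipsInstrCompiler.td below.
bool maskIsUnneeded(uint64_t MaskImm, uint64_t KnownZeroOfAmt,
                    unsigned ShAmtBits) {
  // The mask keeps every bit the shifter actually reads.
  if (unsigned(std::countr_one(MaskImm)) >= ShAmtBits)
    return true;
  // Bits the mask clears are already known to be zero in the amount.
  return unsigned(std::countr_one(MaskImm | KnownZeroOfAmt)) >= ShAmtBits;
}
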
diff --git a/llvm/lib/Target/Mips/MipsISelDAGToDAG.h b/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
index e41cb08712ca..52207d0f6284 100644
--- a/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -143,6 +143,7 @@ private:
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
InlineAsm::ConstraintCode ConstraintID,
std::vector<SDValue> &OutOps) override;
+ bool isUnneededShiftMask(SDNode *N, unsigned ShAmtBits) const;
};
}
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index a0cab8024386..483eba4e4f47 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -2508,7 +2508,7 @@ SDValue MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
SDValue MipsTargetLowering::
lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// check the depth
- if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0) {
+ if (Op.getConstantOperandVal(0) != 0) {
DAG.getContext()->emitError(
"return address can be determined only for current frame");
return SDValue();
@@ -2529,7 +2529,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
return SDValue();
// check the depth
- if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0) {
+ if (Op.getConstantOperandVal(0) != 0) {
DAG.getContext()->emitError(
"return address can be determined only for current frame");
return SDValue();
diff --git a/llvm/lib/Target/Mips/MipsInstrCompiler.td b/llvm/lib/Target/Mips/MipsInstrCompiler.td
new file mode 100644
index 000000000000..8ae3d71978b1
--- /dev/null
+++ b/llvm/lib/Target/Mips/MipsInstrCompiler.td
@@ -0,0 +1,33 @@
+//===- MipsInstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+
+def shiftMask_32 : PatFrag<(ops node:$lhs), (and node:$lhs, imm), [{
+ return isUnneededShiftMask(N, 5);
+}]>;
+
+def shiftMask_64 : PatFrag<(ops node:$src0), (and node:$src0, imm), [{
+ return isUnneededShiftMask(N, 6);
+}]>;
+
+foreach width = [32, 64] in {
+defvar shiftMask = !cast<SDPatternOperator>("shiftMask_"#width);
+def mshl_#width : PatFrags<(ops node:$src0, node:$src1),
+ [(shl node:$src0, node:$src1), (shl node:$src0, (shiftMask node:$src1))]>;
+
+def msrl_#width : PatFrags<(ops node:$src0, node:$src1),
+ [(srl node:$src0, node:$src1), (srl node:$src0, (shiftMask node:$src1))]>;
+
+def msra_#width : PatFrags<(ops node:$src0, node:$src1),
+ [(sra node:$src0, node:$src1), (sra node:$src0, (shiftMask node:$src1))]>;
+}
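
The mshl_/msrl_/msra_ fragments defined above let a shift match even when its amount has already been masked, because the MIPS variable shift instructions only read the low 5 bits of the amount (low 6 for the 64-bit forms), as checked by isUnneededShiftMask. The motivating source pattern, as I read the change, is C code that masks the amount to stay free of undefined behaviour; after this patch the 'and' should fold into the shift instead of surviving as a separate mask instruction:

// Hypothetical examples (not from the patch): each "& 31" is redundant for
// the instruction that is eventually selected, so sllv/srlv/srav can be
// emitted without a preceding mask.
unsigned shl32(unsigned X, unsigned Amt) { return X << (Amt & 31); }
unsigned lshr32(unsigned X, unsigned Amt) { return X >> (Amt & 31); }
int ashr32(int X, unsigned Amt) { return X >> (Amt & 31); }
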
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td
index 75270857ea13..4b6f4b22e71b 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
// Mips profiles and nodes
//===----------------------------------------------------------------------===//
+include "MipsInstrCompiler.td"
def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
@@ -2079,17 +2080,17 @@ let AdditionalPredicates = [NotInMicroMips] in {
let AdditionalPredicates = [NotInMicroMips] in {
/// Shift Instructions
- def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
+ def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, mshl_32,
immZExt5>, SRA_FM<0, 0>, ISA_MIPS1;
- def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
+ def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, msrl_32,
immZExt5>, SRA_FM<2, 0>, ISA_MIPS1;
- def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
+ def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, msra_32,
immZExt5>, SRA_FM<3, 0>, ISA_MIPS1;
- def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
+ def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, mshl_32>,
SRLV_FM<4, 0>, ISA_MIPS1;
- def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
+ def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, msrl_32>,
SRLV_FM<6, 0>, ISA_MIPS1;
- def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
+ def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, msra_32>,
SRLV_FM<7, 0>, ISA_MIPS1;
// Rotate Instructions
diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 8c865afd4207..0ed87ee0809a 100644
--- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -831,8 +831,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
case ISD::INTRINSIC_W_CHAIN: {
- const unsigned IntrinsicOpcode =
- cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const unsigned IntrinsicOpcode = Node->getConstantOperandVal(1);
switch (IntrinsicOpcode) {
default:
break;
@@ -885,7 +884,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
case ISD::INTRINSIC_WO_CHAIN: {
- switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
+ switch (Node->getConstantOperandVal(0)) {
default:
break;
@@ -901,8 +900,7 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
case ISD::INTRINSIC_VOID: {
- const unsigned IntrinsicOpcode =
- cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const unsigned IntrinsicOpcode = Node->getConstantOperandVal(1);
switch (IntrinsicOpcode) {
default:
break;
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 5c34067c8888..e9788fa7ed73 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1519,7 +1519,7 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
EVT ResTy = Op->getValueType(0);
APInt BitImm = APInt(ResTy.getScalarSizeInBits(), 1)
- << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+ << Op->getConstantOperandAPInt(2);
SDValue BitMask = DAG.getConstant(~BitImm, DL, ResTy);
return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask);
@@ -1528,7 +1528,7 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
- unsigned Intrinsic = cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue();
+ unsigned Intrinsic = Op->getConstantOperandVal(0);
switch (Intrinsic) {
default:
return SDValue();
@@ -2300,7 +2300,7 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr,
SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+ unsigned Intr = Op->getConstantOperandVal(1);
switch (Intr) {
default:
return SDValue();
@@ -2375,7 +2375,7 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr,
SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
- unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+ unsigned Intr = Op->getConstantOperandVal(1);
switch (Intr) {
default:
return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 894a8636f458..815c46edb6fa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -513,7 +513,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
}
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(1);
switch (IID) {
default:
return false;
@@ -730,7 +730,7 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
}
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(0);
switch (IID) {
default:
return false;
@@ -1246,7 +1246,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
Op1 = N->getOperand(2);
Mem = cast<MemIntrinsicSDNode>(N);
- unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(1);
switch (IID) {
default:
return false;
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index b57d185bb638..ed96339240d9 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -4902,8 +4902,7 @@ bool PPCDAGToDAGISel::trySelectLoopCountIntrinsic(SDNode *N) {
return false;
if (LHS.getOperand(0).getOpcode() != ISD::INTRINSIC_W_CHAIN ||
- cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() !=
- Intrinsic::loop_decrement)
+ LHS.getOperand(0).getConstantOperandVal(1) != Intrinsic::loop_decrement)
return false;
if (!isa<ConstantSDNode>(RHS))
@@ -6011,7 +6010,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// Op #3 is the Dest MBB
// Op #4 is the Flag.
// Prevent PPC::PRED_* from being selected into LI.
- unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned PCC = N->getConstantOperandVal(1);
if (EnableBranchHint)
PCC |= getBranchHint(PCC, *FuncInfo, N->getOperand(3));
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ae0d3b76f89a..8f27e6677afa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1176,6 +1176,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
// No implementation for these ops for PowerPC.
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
@@ -1411,6 +1412,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLibcallName(RTLIB::EXP2_F128, "exp2f128");
setLibcallName(RTLIB::SIN_F128, "sinf128");
setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
setLibcallName(RTLIB::POW_F128, "powf128");
setLibcallName(RTLIB::FMIN_F128, "fminf128");
setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
@@ -2815,8 +2817,8 @@ bool PPCTargetLowering::SelectAddressRegImm(
return true; // [r+i]
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
// Match LOAD (ADD (X, Lo(G))).
- assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
- && "Cannot handle constant offsets yet!");
+ assert(!N.getOperand(1).getConstantOperandVal(1) &&
+ "Cannot handle constant offsets yet!");
Disp = N.getOperand(1).getOperand(0); // The global address.
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
@@ -3822,8 +3824,7 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
// Check all operands that may contain the LR.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- const InlineAsm::Flag Flags(
- cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue());
+ const InlineAsm::Flag Flags(Op.getConstantOperandVal(i));
unsigned NumVals = Flags.getNumOperandRegisters();
++i; // Skip the ID value.
@@ -10440,8 +10441,7 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
/// information about the intrinsic.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
bool &isDot, const PPCSubtarget &Subtarget) {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+ unsigned IntrinsicID = Intrin.getConstantOperandVal(0);
CompareOpc = -1;
isDot = false;
switch (IntrinsicID) {
@@ -10726,8 +10726,7 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
/// lower, do it, otherwise return null.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntrinsicID = Op.getConstantOperandVal(0);
SDLoc dl(Op);
@@ -10945,7 +10944,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Unpack the result based on how the target uses it.
unsigned BitNo; // Bit # of CR6.
bool InvertBit; // Invert result?
- switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+ switch (Op.getConstantOperandVal(1)) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Return the value of the EQ bit of CR6.
BitNo = 0; InvertBit = false;
@@ -10981,7 +10980,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
// the beginning of the argument list.
int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
SDLoc DL(Op);
- switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
+ switch (Op.getConstantOperandVal(ArgStart)) {
case Intrinsic::ppc_cfence: {
assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
SDValue Val = Op.getOperand(ArgStart + 1);
@@ -11546,7 +11545,7 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
// Custom lower is only done for high or low doubleword.
- int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ int Idx = Op0.getConstantOperandVal(1);
if (Idx % 2 != 0)
return SDValue();
@@ -11715,8 +11714,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
break;
}
case ISD::INTRINSIC_W_CHAIN: {
- if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
- Intrinsic::loop_decrement)
+ if (N->getConstantOperandVal(1) != Intrinsic::loop_decrement)
break;
assert(N->getValueType(0) == MVT::i1 &&
@@ -11732,7 +11730,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
- switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
+ switch (N->getConstantOperandVal(0)) {
case Intrinsic::ppc_pack_longdouble:
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
N->getOperand(2), N->getOperand(1)));
@@ -13652,7 +13650,7 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
EVT VT;
- switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ switch (N->getConstantOperandVal(1)) {
default: return false;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
@@ -13680,7 +13678,7 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
if (N->getOpcode() == ISD::INTRINSIC_VOID) {
EVT VT;
- switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ switch (N->getConstantOperandVal(1)) {
default: return false;
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
@@ -15544,8 +15542,7 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
}
static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Intrin.getOperand(1))->getZExtValue();
+ unsigned IntrinsicID = Intrin.getConstantOperandVal(1);
if (IntrinsicID == Intrinsic::ppc_stdcx)
StoreWidth = 8;
else if (IntrinsicID == Intrinsic::ppc_stwcx)
@@ -15977,7 +15974,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::INTRINSIC_WO_CHAIN: {
bool isLittleEndian = Subtarget.isLittleEndian();
- unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IID = N->getConstantOperandVal(0);
Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
: Intrinsic::ppc_altivec_lvsl);
if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) {
@@ -15990,36 +15987,34 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
.zext(Add.getScalarValueSizeInBits()))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode *U : BasePtr->uses()) {
- if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
- cast<ConstantSDNode>(U->getOperand(0))->getZExtValue() == IID) {
- // We've found another LVSL/LVSR, and this address is an aligned
- // multiple of that one. The results will be the same, so use the
- // one we've just found instead.
+ if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ U->getConstantOperandVal(0) == IID) {
+ // We've found another LVSL/LVSR, and this address is an aligned
+ // multiple of that one. The results will be the same, so use the
+ // one we've just found instead.
- return SDValue(U, 0);
- }
+ return SDValue(U, 0);
+ }
}
}
if (isa<ConstantSDNode>(Add->getOperand(1))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode *U : BasePtr->uses()) {
- if (U->getOpcode() == ISD::ADD &&
- isa<ConstantSDNode>(U->getOperand(1)) &&
- (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
- cast<ConstantSDNode>(U->getOperand(1))->getZExtValue()) %
- (1ULL << Bits) ==
- 0) {
- SDNode *OtherAdd = U;
- for (SDNode *V : OtherAdd->uses()) {
- if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
- cast<ConstantSDNode>(V->getOperand(0))->getZExtValue() ==
- IID) {
- return SDValue(V, 0);
- }
+ if (U->getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(U->getOperand(1)) &&
+ (Add->getConstantOperandVal(1) - U->getConstantOperandVal(1)) %
+ (1ULL << Bits) ==
+ 0) {
+ SDNode *OtherAdd = U;
+ for (SDNode *V : OtherAdd->uses()) {
+ if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+ V->getConstantOperandVal(0) == IID) {
+ return SDValue(V, 0);
}
}
}
+ }
}
}
@@ -16059,30 +16054,30 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
break;
case ISD::INTRINSIC_W_CHAIN:
- switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
- default:
- break;
- case Intrinsic::ppc_altivec_vsum4sbs:
- case Intrinsic::ppc_altivec_vsum4shs:
- case Intrinsic::ppc_altivec_vsum4ubs: {
- // These sum-across intrinsics only have a chain due to the side effect
- // that they may set the SAT bit. If we know the SAT bit will not be set
- // for some inputs, we can replace any uses of their chain with the input
- // chain.
- if (BuildVectorSDNode *BVN =
- dyn_cast<BuildVectorSDNode>(N->getOperand(3))) {
- APInt APSplatBits, APSplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- bool BVNIsConstantSplat = BVN->isConstantSplat(
- APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0,
- !Subtarget.isLittleEndian());
- // If the constant splat vector is 0, the SAT bit will not be set.
- if (BVNIsConstantSplat && APSplatBits == 0)
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0));
+ switch (N->getConstantOperandVal(1)) {
+ default:
+ break;
+ case Intrinsic::ppc_altivec_vsum4sbs:
+ case Intrinsic::ppc_altivec_vsum4shs:
+ case Intrinsic::ppc_altivec_vsum4ubs: {
+ // These sum-across intrinsics only have a chain due to the side effect
+ // that they may set the SAT bit. If we know the SAT bit will not be set
+ // for some inputs, we can replace any uses of their chain with the
+ // input chain.
+ if (BuildVectorSDNode *BVN =
+ dyn_cast<BuildVectorSDNode>(N->getOperand(3))) {
+ APInt APSplatBits, APSplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ bool BVNIsConstantSplat = BVN->isConstantSplat(
+ APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0,
+ !Subtarget.isLittleEndian());
+ // If the constant splat vector is 0, the SAT bit will not be set.
+ if (BVNIsConstantSplat && APSplatBits == 0)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0));
+ }
+ return SDValue();
}
- return SDValue();
- }
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvd2x:
// For little endian, VSX loads require generating lxvd2x/xxswapd.
@@ -16096,7 +16091,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// For little endian, VSX stores require generating xxswapd/stxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Subtarget.needsSwapsForVSXMemOps()) {
- switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ switch (N->getConstantOperandVal(1)) {
default:
break;
case Intrinsic::ppc_vsx_stxvw4x:
@@ -16325,7 +16320,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// Unpack the result based on how the target uses it.
PPC::Predicate CompOpc;
- switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
+ switch (LHS.getConstantOperandVal(1)) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Branch on the value of the EQ bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
@@ -16404,7 +16399,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
- switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+ switch (Op.getConstantOperandVal(0)) {
default: break;
case Intrinsic::ppc_altivec_vcmpbfp_p:
case Intrinsic::ppc_altivec_vcmpeqfp_p:
@@ -16431,7 +16426,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
}
case ISD::INTRINSIC_W_CHAIN: {
- switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+ switch (Op.getConstantOperandVal(1)) {
default:
break;
case Intrinsic::ppc_load2r:
@@ -16866,7 +16861,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
return SDValue();
SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
// Make sure the function does not optimize away the store of the RA to
// the stack.
@@ -16899,7 +16894,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -18084,8 +18079,7 @@ static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.
else
FlagSet |= PPC::MOF_RPlusR; // Register.
- } else if (RHS.getOpcode() == PPCISD::Lo &&
- !cast<ConstantSDNode>(RHS.getOperand(1))->getZExtValue())
+ } else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(1))
FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.
else
FlagSet |= PPC::MOF_RPlusR;
@@ -18129,7 +18123,7 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
unsigned ParentOp = Parent->getOpcode();
if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
(ParentOp == ISD::INTRINSIC_VOID))) {
- unsigned ID = cast<ConstantSDNode>(Parent->getOperand(1))->getZExtValue();
+ unsigned ID = Parent->getConstantOperandVal(1);
if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) {
SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp)
? Parent->getOperand(2)
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index f3ea0f597eec..4759aa951664 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1832,57 +1832,18 @@ ParseStatus RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
if (getParser().parseIdentifier(Identifier))
return ParseStatus::Failure;
- // Check for CSR names conflicts.
- // Custom CSR names might conflict with CSR names in privileged spec.
- // E.g. - SiFive mnscratch(0x350) and privileged spec mnscratch(0x740).
- auto CheckCSRNameConflict = [&]() {
- if (!(RISCVSysReg::lookupSysRegByName(Identifier))) {
- Error(S, "system register use requires an option to be enabled");
- return true;
- }
- return false;
- };
-
- // First check for vendor specific CSRs.
- auto SiFiveReg = RISCVSysReg::lookupSiFiveRegByName(Identifier);
- if (SiFiveReg) {
- if (SiFiveReg->haveVendorRequiredFeatures(getSTI().getFeatureBits())) {
- Operands.push_back(
- RISCVOperand::createSysReg(Identifier, S, SiFiveReg->Encoding));
- return ParseStatus::Success;
- }
- if (CheckCSRNameConflict())
- return ParseStatus::Failure;
- }
-
auto SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
if (!SysReg)
+ SysReg = RISCVSysReg::lookupSysRegByAltName(Identifier);
+ if (!SysReg)
if ((SysReg = RISCVSysReg::lookupSysRegByDeprecatedName(Identifier)))
Warning(S, "'" + Identifier + "' is a deprecated alias for '" +
SysReg->Name + "'");
- // Check for CSR encoding conflicts.
- // Custom CSR encoding might conflict with CSR encoding in privileged spec.
- // E.g. - SiFive mnscratch(0x350) and privileged spec miselect(0x350).
- auto CheckCSREncodingConflict = [&]() {
- auto Reg = RISCVSysReg::lookupSiFiveRegByEncoding(SysReg->Encoding);
- if (Reg && Reg->haveVendorRequiredFeatures(getSTI().getFeatureBits())) {
- Warning(S, "'" + Identifier + "' CSR is not available on the current " +
- "subtarget. Instead '" + Reg->Name +
- "' CSR will be used.");
- Operands.push_back(
- RISCVOperand::createSysReg(Reg->Name, S, Reg->Encoding));
- return true;
- }
- return false;
- };
-
- // Accept a named SysReg if the required features are present.
+ // Accept a named Sys Reg if the required features are present.
if (SysReg) {
if (!SysReg->haveRequiredFeatures(getSTI().getFeatureBits()))
return Error(S, "system register use requires an option to be enabled");
- if (CheckCSREncodingConflict())
- return ParseStatus::Success;
Operands.push_back(
RISCVOperand::createSysReg(Identifier, S, SysReg->Encoding));
return ParseStatus::Success;
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 53e2b6b4d94e..ed80da14c795 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -74,6 +74,17 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint32_t RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeGPRX1X5RegisterClass(MCInst &Inst, uint32_t RegNo,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ MCRegister Reg = RISCV::X0 + RegNo;
+ if (Reg != RISCV::X1 && Reg != RISCV::X5)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, uint32_t RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -359,6 +370,10 @@ static DecodeStatus decodeRegReg(MCInst &Inst, uint32_t Insn, uint64_t Address,
static DecodeStatus decodeZcmpSpimm(MCInst &Inst, unsigned Imm,
uint64_t Address, const void *Decoder);
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder);
+
#include "RISCVGenDisassemblerTables.inc"
static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
@@ -373,6 +388,16 @@ static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
return MCDisassembler::Success;
}
+static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
+ uint64_t Address,
+ const MCDisassembler *Decoder) {
+ uint32_t Rs1 = fieldFromInstruction(Insn, 7, 5);
+ DecodeStatus Result = DecodeGPRX1X5RegisterClass(Inst, Rs1, Address, Decoder);
+ (void)Result;
+ assert(Result == MCDisassembler::Success && "Invalid register");
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -462,10 +487,8 @@ static DecodeStatus decodeRegReg(MCInst &Inst, uint32_t Insn, uint64_t Address,
return MCDisassembler::Success;
}
-// spimm is based on rlist now.
static DecodeStatus decodeZcmpSpimm(MCInst &Inst, unsigned Imm,
uint64_t Address, const void *Decoder) {
- // TODO: check if spimm matches rlist
Inst.addOperand(MCOperand::createImm(Imm));
return MCDisassembler::Success;
}
@@ -568,8 +591,6 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
TRY_TO_DECODE_FEATURE(
RISCV::FeatureVendorXSfvfnrclipxfqf, DecoderTableXSfvfnrclipxfqf32,
"SiFive FP32-to-int8 Ranged Clip Instructions opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfcie, DecoderTableXSfcie32,
- "Sifive CIE custom opcode table");
TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbitmanip,
DecoderTableXCVbitmanip32,
"CORE-V Bit Manipulation custom opcode table");
@@ -600,6 +621,8 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
TRY_TO_DECODE_AND_ADD_SP(!STI.hasFeature(RISCV::Feature64Bit),
DecoderTableRISCV32Only_16,
"RISCV32Only_16 table (16-bit Instruction)");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureStdExtZicfiss, DecoderTableZicfiss16,
+ "RVZicfiss table (Shadow Stack)");
TRY_TO_DECODE_FEATURE(RISCV::FeatureStdExtZcmt, DecoderTableRVZcmt16,
"Zcmt table (16-bit Table Jump Instructions)");
TRY_TO_DECODE_FEATURE(
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 28ec999157c6..079906d1958c 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -101,7 +101,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
auto &RotateActions = getActionDefinitionsBuilder({G_ROTL, G_ROTR});
- if (ST.hasStdExtZbb()) {
+ if (ST.hasStdExtZbb() || ST.hasStdExtZbkb()) {
RotateActions.legalFor({{s32, sXLen}, {sXLen, sXLen}});
// Widen s32 rotate amount to s64 so SDAG patterns will match.
if (ST.is64Bit())
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
index 66a46a485f53..74d0db545e55 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp
@@ -27,7 +27,6 @@ extern const SubtargetFeatureKV RISCVFeatureKV[RISCV::NumSubtargetFeatures];
namespace RISCVSysReg {
#define GET_SysRegsList_IMPL
-#define GET_SiFiveRegsList_IMPL
#include "RISCVGenSearchableTables.inc"
} // namespace RISCVSysReg
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 30ed36525e29..c32210fc1419 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -401,6 +401,7 @@ int getLoadFPImm(APFloat FPImm);
namespace RISCVSysReg {
struct SysReg {
const char *Name;
+ const char *AltName;
const char *DeprecatedName;
unsigned Encoding;
// FIXME: add these additional fields when needed.
@@ -424,22 +425,9 @@ struct SysReg {
return true;
return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
}
-
- bool haveVendorRequiredFeatures(const FeatureBitset &ActiveFeatures) const {
- // Not in 32-bit mode.
- if (isRV32Only && ActiveFeatures[RISCV::Feature64Bit])
- return false;
- // No required feature associated with the system register.
- if (FeaturesRequired.none())
- return false;
- return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
- }
};
-struct SiFiveReg : SysReg {};
-
#define GET_SysRegsList_DECL
-#define GET_SiFiveRegsList_DECL
#include "RISCVGenSearchableTables.inc"
} // end namespace RISCVSysReg
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 195dda0b8b14..bd899495812f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -121,11 +121,8 @@ void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNo).getImm();
- auto SiFiveReg = RISCVSysReg::lookupSiFiveRegByEncoding(Imm);
auto SysReg = RISCVSysReg::lookupSysRegByEncoding(Imm);
- if (SiFiveReg && SiFiveReg->haveVendorRequiredFeatures(STI.getFeatureBits()))
- markup(O, Markup::Register) << SiFiveReg->Name;
- else if (SysReg && SysReg->haveRequiredFeatures(STI.getFeatureBits()))
+ if (SysReg && SysReg->haveRequiredFeatures(STI.getFeatureBits()))
markup(O, Markup::Register) << SysReg->Name;
else
markup(O, Markup::Register) << formatImm(Imm);
diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.td b/llvm/lib/Target/RISCV/RISCVCallingConv.td
index 130a6ecc143d..3dd0b3723828 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.td
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.td
@@ -14,7 +14,7 @@
// RISCVISelLowering.cpp (CC_RISCV).
def CSR_ILP32_LP64
- : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>;
+ : CalleeSavedRegs<(add X1, X8, X9, (sequence "X%u", 18, 27))>;
def CSR_ILP32F_LP64F
: CalleeSavedRegs<(add CSR_ILP32_LP64,
@@ -29,7 +29,7 @@ def CSR_NoRegs : CalleeSavedRegs<(add)>;
// Interrupt handler needs to save/restore all registers that are used,
// both Caller and Callee saved registers.
-def CSR_Interrupt : CalleeSavedRegs<(add X1, (sequence "X%u", 3, 31))>;
+def CSR_Interrupt : CalleeSavedRegs<(add X1, (sequence "X%u", 5, 31))>;
// Same as CSR_Interrupt, but including all 32-bit FP registers.
def CSR_XLEN_F32_Interrupt: CalleeSavedRegs<(add CSR_Interrupt,
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 2095446c694b..59b202606dad 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -687,6 +687,28 @@ def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">,
AssemblerPredicate<(all_of FeatureStdExtZicond),
"'Zicond' (Integer Conditional Operations)">;
+def FeatureStdExtZimop : SubtargetFeature<"experimental-zimop", "HasStdExtZimop", "true",
+ "'Zimop' (May-Be-Operations)">;
+def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">,
+ AssemblerPredicate<(all_of FeatureStdExtZimop),
+ "'Zimop' (May-Be-Operations)">;
+
+def FeatureStdExtZcmop : SubtargetFeature<"experimental-zcmop", "HasStdExtZcmop", "true",
+ "'Zcmop' (Compressed May-Be-Operations)",
+ [FeatureStdExtZca]>;
+def HasStdExtZcmop : Predicate<"Subtarget->hasStdExtZcmop()">,
+ AssemblerPredicate<(all_of FeatureStdExtZcmop),
+ "'Zcmop' (Compressed May-Be-Operations)">;
+
+def FeatureStdExtZicfiss
+ : SubtargetFeature<"experimental-zicfiss", "HasStdExtZicfiss", "true",
+ "'Zicfiss' (Shadow stack)",
+ [FeatureStdExtZicsr, FeatureStdExtZimop]>;
+def HasStdExtZicfiss : Predicate<"Subtarget->hasStdExtZicfiss()">,
+ AssemblerPredicate<(all_of FeatureStdExtZicfiss),
+ "'Zicfiss' (Shadow stack)">;
+def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">;
+
def FeatureStdExtSmaia
: SubtargetFeature<"smaia", "HasStdExtSmaia", "true",
"'Smaia' (Smaia encompasses all added CSRs and all "
@@ -813,13 +835,6 @@ def HasVendorXSfvcp : Predicate<"Subtarget->hasVendorXSfvcp()">,
AssemblerPredicate<(all_of FeatureVendorXSfvcp),
"'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions)">;
-def FeatureVendorXSfcie
- : SubtargetFeature<"xsfcie", "HasVendorXSfcie", "true",
- "'XSfcie' (SiFive Custom Instruction Extension SCIE.)">;
-def HasVendorXSfcie : Predicate<"Subtarget->hasVendorXSfcie()">,
- AssemblerPredicate<(all_of FeatureVendorXSfcie),
- "'XSfcie' (SiFive Custom Instruction Extension SCIE.)">;
-
def FeatureVendorXSfvqmaccdod
: SubtargetFeature<"xsfvqmaccdod", "HasVendorXSfvqmaccdod", "true",
"'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))",
@@ -977,9 +992,19 @@ def TuneLUIADDIFusion
def TuneAUIPCADDIFusion
: SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion",
"true", "Enable AUIPC+ADDI macrofusion">;
-def TuneShiftedZExtFusion
- : SubtargetFeature<"shifted-zext-fusion", "HasShiftedZExtFusion",
- "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension">;
+
+def TuneZExtHFusion
+ : SubtargetFeature<"zexth-fusion", "HasZExtHFusion",
+ "true", "Enable SLLI+SRLI to be fused to zero extension of halfword">;
+
+def TuneZExtWFusion
+ : SubtargetFeature<"zextw-fusion", "HasZExtWFusion",
+ "true", "Enable SLLI+SRLI to be fused to zero extension of word">;
+
+def TuneShiftedZExtWFusion
+ : SubtargetFeature<"shifted-zextw-fusion", "HasShiftedZExtWFusion",
+ "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension of word">;
+
def TuneLDADDFusion
: SubtargetFeature<"ld-add-fusion", "HasLDADDFusion",
"true", "Enable LD+ADD macrofusion.">;
@@ -1001,12 +1026,8 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
[TuneNoDefaultUnroll,
TuneShortForwardBranchOpt]>;
-def TuneVeyronFusions : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
- "Ventana Veyron-Series processors",
- [TuneLUIADDIFusion,
- TuneAUIPCADDIFusion,
- TuneShiftedZExtFusion,
- TuneLDADDFusion]>;
+def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
+ "Ventana Veyron-Series processors">;
// Assume that lock-free native-width atomics are available, even if the target
// and operating system combination would not usually provide them. The user
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 098a320c9153..bfa3bf3cc74e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1360,7 +1360,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (N0.getOpcode() != ISD::AND || !isa<ConstantSDNode>(N0.getOperand(1)))
break;
- uint64_t C2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+ uint64_t C2 = N0.getConstantOperandVal(1);
// Constant should be a mask.
if (!isMask_64(C2))
@@ -1604,7 +1604,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
break;
}
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
// By default we do not custom select any intrinsic.
default:
@@ -1825,7 +1825,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
break;
}
case ISD::INTRINSIC_VOID: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
case Intrinsic::riscv_vsseg2:
case Intrinsic::riscv_vsseg3:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 97d76ca494cb..03a59f8a8b57 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -675,7 +675,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND,
ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN,
ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX,
- ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE};
+ ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE};
static const unsigned FloatingPointVPOps[] = {
ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
@@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_FCEIL, ISD::VP_FFLOOR, ISD::VP_FROUND,
ISD::VP_FROUNDEVEN, ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO,
ISD::VP_FRINT, ISD::VP_FNEARBYINT, ISD::VP_IS_FPCLASS,
- ISD::EXPERIMENTAL_VP_REVERSE};
+ ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE};
static const unsigned IntegerVecReduceOps[] = {
ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR,
@@ -773,6 +773,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_REVERSE, VT, Custom);
+ setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom);
setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom);
setOperationPromotedToType(
@@ -1147,6 +1148,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_SETCC, ISD::VP_TRUNCATE},
VT, Custom);
+ setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom);
setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom);
continue;
}
@@ -1372,8 +1374,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setPrefLoopAlignment(Subtarget.getPrefLoopAlignment());
setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
- ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND,
- ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
+ ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL,
+ ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
if (Subtarget.is64Bit())
setTargetDAGCombine(ISD::SRA);
@@ -5528,7 +5530,7 @@ static unsigned getRISCVVLOp(SDValue Op) {
case ISD::VP_SELECT:
return RISCVISD::VSELECT_VL;
case ISD::VP_MERGE:
- return RISCVISD::VP_MERGE_VL;
+ return RISCVISD::VMERGE_VL;
case ISD::VP_ASHR:
return RISCVISD::SRA_VL;
case ISD::VP_LSHR:
@@ -5576,6 +5578,8 @@ static bool hasMergeOp(unsigned Opcode) {
return true;
if (Opcode >= RISCVISD::STRICT_FADD_VL && Opcode <= RISCVISD::STRICT_FDIV_VL)
return true;
+ if (Opcode == RISCVISD::VMERGE_VL)
+ return true;
return false;
}
@@ -6637,6 +6641,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
!Subtarget.hasVInstructionsF16()))
return SplitVPOp(Op, DAG);
return lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget);
+ case ISD::EXPERIMENTAL_VP_SPLICE:
+ return lowerVPSpliceExperimental(Op, DAG);
case ISD::EXPERIMENTAL_VP_REVERSE:
return lowerVPReverseExperimental(Op, DAG);
}
@@ -7229,7 +7235,7 @@ SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
while (Depth--) {
int Offset = -(XLenInBytes * 2);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
@@ -7254,7 +7260,7 @@ SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
if (Depth) {
int Off = -XLenInBytes;
SDValue FrameAddr = lowerFRAMEADDR(Op, DAG);
@@ -8238,8 +8244,8 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,
AVL);
// TUMA or TUMU: Currently we always emit tumu policy regardless of tuma.
// It's fine because vmerge does not care mask policy.
- return DAG.getNode(RISCVISD::VP_MERGE_VL, DL, VT, Mask, Vec, MaskedOff,
- AVL);
+ return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff,
+ MaskedOff, AVL);
}
}
@@ -10312,9 +10318,20 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const {
for (const auto &OpIdx : enumerate(Op->ops())) {
SDValue V = OpIdx.value();
assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
- // Add dummy merge value before the mask.
- if (HasMergeOp && *ISD::getVPMaskIdx(Op.getOpcode()) == OpIdx.index())
- Ops.push_back(DAG.getUNDEF(ContainerVT));
+ // Add dummy merge value before the mask. Or if there isn't a mask, before
+ // EVL.
+ if (HasMergeOp) {
+ auto MaskIdx = ISD::getVPMaskIdx(Op.getOpcode());
+ if (MaskIdx) {
+ if (*MaskIdx == OpIdx.index())
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ } else if (ISD::getVPExplicitVectorLengthIdx(Op.getOpcode()) ==
+ OpIdx.index()) {
+ // For VP_MERGE, copy the false operand instead of an undef value.
+ assert(Op.getOpcode() == ISD::VP_MERGE);
+ Ops.push_back(Ops.back());
+ }
+ }
// Pass through operands which aren't fixed-length vectors.
if (!V.getValueType().isFixedLengthVector()) {
Ops.push_back(V);
@@ -10583,6 +10600,87 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op,
}
SDValue
+RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ SDValue Op1 = Op.getOperand(0);
+ SDValue Op2 = Op.getOperand(1);
+ SDValue Offset = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue EVL1 = Op.getOperand(4);
+ SDValue EVL2 = Op.getOperand(5);
+
+ const MVT XLenVT = Subtarget.getXLenVT();
+ MVT VT = Op.getSimpleValueType();
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
+ Op2 = convertToScalableVector(ContainerVT, Op2, DAG, Subtarget);
+ MVT MaskVT = getMaskTypeFor(ContainerVT);
+ Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+ }
+
+ bool IsMaskVector = VT.getVectorElementType() == MVT::i1;
+ if (IsMaskVector) {
+ ContainerVT = ContainerVT.changeVectorElementType(MVT::i8);
+
+ // Expand input operands
+ SDValue SplatOneOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT),
+ DAG.getConstant(1, DL, XLenVT), EVL1);
+ SDValue SplatZeroOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT),
+ DAG.getConstant(0, DL, XLenVT), EVL1);
+ Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op1, SplatOneOp1,
+ SplatZeroOp1, EVL1);
+
+ SDValue SplatOneOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT),
+ DAG.getConstant(1, DL, XLenVT), EVL2);
+ SDValue SplatZeroOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT),
+ DAG.getConstant(0, DL, XLenVT), EVL2);
+ Op2 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op2, SplatOneOp2,
+ SplatZeroOp2, EVL2);
+ }
+
+ int64_t ImmValue = cast<ConstantSDNode>(Offset)->getSExtValue();
+ SDValue DownOffset, UpOffset;
+ if (ImmValue >= 0) {
+ // The operand is a TargetConstant, we need to rebuild it as a regular
+ // constant.
+ DownOffset = DAG.getConstant(ImmValue, DL, XLenVT);
+ UpOffset = DAG.getNode(ISD::SUB, DL, XLenVT, EVL1, DownOffset);
+ } else {
+ // The operand is a TargetConstant, we need to rebuild it as a regular
+ // constant rather than negating the original operand.
+ UpOffset = DAG.getConstant(-ImmValue, DL, XLenVT);
+ DownOffset = DAG.getNode(ISD::SUB, DL, XLenVT, EVL1, UpOffset);
+ }
+
+ SDValue SlideDown =
+ getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
+ Op1, DownOffset, Mask, UpOffset);
+ SDValue Result = getVSlideup(DAG, Subtarget, DL, ContainerVT, SlideDown, Op2,
+ UpOffset, Mask, EVL2, RISCVII::TAIL_AGNOSTIC);
+
+ if (IsMaskVector) {
+ // Truncate Result back to a mask vector (Result has same EVL as Op2)
+ Result = DAG.getNode(
+ RISCVISD::SETCC_VL, DL, ContainerVT.changeVectorElementType(MVT::i1),
+ {Result, DAG.getConstant(0, DL, ContainerVT),
+ DAG.getCondCode(ISD::SETNE), DAG.getUNDEF(getMaskTypeFor(ContainerVT)),
+ Mask, EVL2});
+ }
+
+ if (!VT.isFixedLengthVector())
+ return Result;
+ return convertFromScalableVector(VT, Result, DAG, Subtarget);
+}
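For a non-negative immediate, the lowering above slides Op1 down by the offset (DownOffset) and slides Op2 up by EVL1 - offset (UpOffset), which amounts to taking an EVL2-element window starting at the offset inside the concatenation of the first EVL1 lanes of Op1 and the first EVL2 lanes of Op2. A minimal lane-by-lane sketch of that arithmetic, ignoring the mask (modelVPSplice and its parameters are illustrative names, not LLVM APIs):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Illustrative model of the vslidedown/vslideup pair for a non-negative
    // splice offset; not part of the backend.
    static std::vector<int> modelVPSplice(const std::vector<int> &Op1,
                                          const std::vector<int> &Op2,
                                          std::size_t Offset, std::size_t EVL1,
                                          std::size_t EVL2) {
      assert(Offset <= EVL1 && Op1.size() >= EVL1 && Op2.size() >= EVL2);
      // DownOffset = Offset, UpOffset = EVL1 - Offset in the lowering above.
      std::vector<int> Res(EVL2);
      for (std::size_t I = 0; I < EVL2; ++I)
        Res[I] = (Offset + I < EVL1)
                     ? Op1[Offset + I]            // slid-down tail of Op1
                     : Op2[Offset + I - EVL1];    // Op2 slid up behind it
      return Res;
    }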
+
+SDValue
RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -11633,7 +11731,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(0);
switch (IntNo) {
default:
llvm_unreachable(
@@ -12752,9 +12850,9 @@ struct CombineResult;
/// Helper class for folding sign/zero extensions.
/// In particular, this class is used for the following combines:
-/// add_vl -> vwadd(u) | vwadd(u)_w
-/// sub_vl -> vwsub(u) | vwsub(u)_w
-/// mul_vl -> vwmul(u) | vwmul_su
+/// add | add_vl -> vwadd(u) | vwadd(u)_w
+/// sub | sub_vl -> vwsub(u) | vwsub(u)_w
+/// mul | mul_vl -> vwmul(u) | vwmul_su
///
/// An object of this class represents an operand of the operation we want to
/// combine.
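As a lane-level illustration of what these folds buy (a sketch, not backend code; the helper names are made up): a widening add produces the double-width sum directly from the narrow sources, so the separate extend nodes become unnecessary.

    #include <cstdint>

    // One lane of vwadd.vv / vwaddu.vv: the i32 result comes straight from the
    // i16 inputs, without first materializing extended i32 operands.
    static int32_t vwaddLane(int16_t A, int16_t B) {
      return static_cast<int32_t>(A) + static_cast<int32_t>(B);
    }
    static uint32_t vwadduLane(uint16_t A, uint16_t B) {
      return static_cast<uint32_t>(A) + static_cast<uint32_t>(B);
    }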
@@ -12799,6 +12897,8 @@ struct NodeExtensionHelper {
/// E.g., for zext(a), this would return a.
SDValue getSource() const {
switch (OrigOperand.getOpcode()) {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
case RISCVISD::VSEXT_VL:
case RISCVISD::VZEXT_VL:
return OrigOperand.getOperand(0);
@@ -12815,7 +12915,8 @@ struct NodeExtensionHelper {
/// Get or create a value that can feed \p Root with the given extension \p
/// SExt. If \p SExt is std::nullopt, this returns the source of this operand.
/// \see ::getSource().
- SDValue getOrCreateExtendedOp(const SDNode *Root, SelectionDAG &DAG,
+ SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget,
std::optional<bool> SExt) const {
if (!SExt.has_value())
return OrigOperand;
@@ -12830,8 +12931,10 @@ struct NodeExtensionHelper {
// If we need an extension, we should be changing the type.
SDLoc DL(Root);
- auto [Mask, VL] = getMaskAndVL(Root);
+ auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget);
switch (OrigOperand.getOpcode()) {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
case RISCVISD::VSEXT_VL:
case RISCVISD::VZEXT_VL:
return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL);
@@ -12871,12 +12974,15 @@ struct NodeExtensionHelper {
/// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()).
static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) {
switch (Opcode) {
+ case ISD::ADD:
case RISCVISD::ADD_VL:
case RISCVISD::VWADD_W_VL:
case RISCVISD::VWADDU_W_VL:
return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL;
+ case ISD::MUL:
case RISCVISD::MUL_VL:
return IsSExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+ case ISD::SUB:
case RISCVISD::SUB_VL:
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
@@ -12889,7 +12995,8 @@ struct NodeExtensionHelper {
/// Get the opcode to materialize \p Opcode(sext(a), zext(b)) ->
/// newOpcode(a, b).
static unsigned getSUOpcode(unsigned Opcode) {
- assert(Opcode == RISCVISD::MUL_VL && "SU is only supported for MUL");
+ assert((Opcode == RISCVISD::MUL_VL || Opcode == ISD::MUL) &&
+ "SU is only supported for MUL");
return RISCVISD::VWMULSU_VL;
}
@@ -12897,8 +13004,10 @@ struct NodeExtensionHelper {
/// newOpcode(a, b).
static unsigned getWOpcode(unsigned Opcode, bool IsSExt) {
switch (Opcode) {
+ case ISD::ADD:
case RISCVISD::ADD_VL:
return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL;
+ case ISD::SUB:
case RISCVISD::SUB_VL:
return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL;
default:
@@ -12908,19 +13017,33 @@ struct NodeExtensionHelper {
using CombineToTry = std::function<std::optional<CombineResult>(
SDNode * /*Root*/, const NodeExtensionHelper & /*LHS*/,
- const NodeExtensionHelper & /*RHS*/)>;
+ const NodeExtensionHelper & /*RHS*/, SelectionDAG &,
+ const RISCVSubtarget &)>;
/// Check if this node needs to be fully folded or extended for all users.
bool needToPromoteOtherUsers() const { return EnforceOneUse; }
/// Helper method to set the various fields of this struct based on the
/// type of \p Root.
- void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG) {
+ void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
SupportsZExt = false;
SupportsSExt = false;
EnforceOneUse = true;
CheckMask = true;
- switch (OrigOperand.getOpcode()) {
+ unsigned Opc = OrigOperand.getOpcode();
+ switch (Opc) {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND: {
+ if (OrigOperand.getValueType().isVector()) {
+ SupportsZExt = Opc == ISD::ZERO_EXTEND;
+ SupportsSExt = Opc == ISD::SIGN_EXTEND;
+ SDLoc DL(Root);
+ MVT VT = Root->getSimpleValueType(0);
+ std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
+ }
+ break;
+ }
case RISCVISD::VZEXT_VL:
SupportsZExt = true;
Mask = OrigOperand.getOperand(1);
@@ -12976,8 +13099,16 @@ struct NodeExtensionHelper {
}
/// Check if \p Root supports any extension folding combines.
- static bool isSupportedRoot(const SDNode *Root) {
+ static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) {
switch (Root->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL: {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(Root->getValueType(0)))
+ return false;
+ return Root->getValueType(0).isScalableVector();
+ }
case RISCVISD::ADD_VL:
case RISCVISD::MUL_VL:
case RISCVISD::VWADD_W_VL:
@@ -12992,9 +13123,10 @@ struct NodeExtensionHelper {
}
/// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx).
- NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG) {
- assert(isSupportedRoot(Root) && "Trying to build an helper with an "
- "unsupported root");
+ NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an "
+ "unsupported root");
assert(OperandIdx < 2 && "Requesting something else than LHS or RHS");
OrigOperand = Root->getOperand(OperandIdx);
@@ -13010,7 +13142,7 @@ struct NodeExtensionHelper {
SupportsZExt =
Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL;
SupportsSExt = !SupportsZExt;
- std::tie(Mask, VL) = getMaskAndVL(Root);
+ std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget);
CheckMask = true;
// There's no existing extension here, so we don't have to worry about
// making sure it gets removed.
@@ -13019,7 +13151,7 @@ struct NodeExtensionHelper {
}
[[fallthrough]];
default:
- fillUpExtensionSupport(Root, DAG);
+ fillUpExtensionSupport(Root, DAG, Subtarget);
break;
}
}
@@ -13035,14 +13167,27 @@ struct NodeExtensionHelper {
}
/// Helper function to get the Mask and VL from \p Root.
- static std::pair<SDValue, SDValue> getMaskAndVL(const SDNode *Root) {
- assert(isSupportedRoot(Root) && "Unexpected root");
- return std::make_pair(Root->getOperand(3), Root->getOperand(4));
+ static std::pair<SDValue, SDValue>
+ getMaskAndVL(const SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ assert(isSupportedRoot(Root, DAG) && "Unexpected root");
+ switch (Root->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL: {
+ SDLoc DL(Root);
+ MVT VT = Root->getSimpleValueType(0);
+ return getDefaultScalableVLOps(VT, DL, DAG, Subtarget);
+ }
+ default:
+ return std::make_pair(Root->getOperand(3), Root->getOperand(4));
+ }
}
/// Check if the Mask and VL of this operand are compatible with \p Root.
- bool areVLAndMaskCompatible(const SDNode *Root) const {
- auto [Mask, VL] = getMaskAndVL(Root);
+ bool areVLAndMaskCompatible(SDNode *Root, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) const {
+ auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget);
return isMaskCompatible(Mask) && isVLCompatible(VL);
}
@@ -13050,11 +13195,14 @@ struct NodeExtensionHelper {
/// foldings that are supported by this class.
static bool isCommutative(const SDNode *N) {
switch (N->getOpcode()) {
+ case ISD::ADD:
+ case ISD::MUL:
case RISCVISD::ADD_VL:
case RISCVISD::MUL_VL:
case RISCVISD::VWADD_W_VL:
case RISCVISD::VWADDU_W_VL:
return true;
+ case ISD::SUB:
case RISCVISD::SUB_VL:
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
@@ -13099,14 +13247,25 @@ struct CombineResult {
/// Return a value that uses TargetOpcode and that can be used to replace
/// Root.
/// The actual replacement is *not* done in that method.
- SDValue materialize(SelectionDAG &DAG) const {
+ SDValue materialize(SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) const {
SDValue Mask, VL, Merge;
- std::tie(Mask, VL) = NodeExtensionHelper::getMaskAndVL(Root);
- Merge = Root->getOperand(2);
+ std::tie(Mask, VL) =
+ NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget);
+ switch (Root->getOpcode()) {
+ default:
+ Merge = Root->getOperand(2);
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ Merge = DAG.getUNDEF(Root->getValueType(0));
+ break;
+ }
return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0),
- LHS.getOrCreateExtendedOp(Root, DAG, SExtLHS),
- RHS.getOrCreateExtendedOp(Root, DAG, SExtRHS), Merge,
- Mask, VL);
+ LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS),
+ RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS),
+ Merge, Mask, VL);
}
};
@@ -13123,15 +13282,16 @@ struct CombineResult {
static std::optional<CombineResult>
canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,
const NodeExtensionHelper &RHS, bool AllowSExt,
- bool AllowZExt) {
+ bool AllowZExt, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
assert((AllowSExt || AllowZExt) && "Forgot to set what you want?");
- if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))
+ if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) ||
+ !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
return std::nullopt;
if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt)
return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(
Root->getOpcode(), /*IsSExt=*/false),
- Root, LHS, /*SExtLHS=*/false, RHS,
- /*SExtRHS=*/false);
+ Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false);
if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt)
return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(
Root->getOpcode(), /*IsSExt=*/true),
@@ -13148,9 +13308,10 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true,
- /*AllowZExt=*/true);
+ /*AllowZExt=*/true, DAG, Subtarget);
}
/// Check if \p Root follows a pattern Root(LHS, ext(RHS))
@@ -13159,8 +13320,9 @@ canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
- if (!RHS.areVLAndMaskCompatible(Root))
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
return std::nullopt;
// FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar
@@ -13184,9 +13346,10 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true,
- /*AllowZExt=*/false);
+ /*AllowZExt=*/false, DAG, Subtarget);
}
/// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS))
@@ -13195,9 +13358,10 @@ canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false,
- /*AllowZExt=*/true);
+ /*AllowZExt=*/true, DAG, Subtarget);
}
/// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS))
@@ -13206,10 +13370,13 @@ canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS,
/// can be used to apply the pattern.
static std::optional<CombineResult>
canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS,
- const NodeExtensionHelper &RHS) {
+ const NodeExtensionHelper &RHS, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+
if (!LHS.SupportsSExt || !RHS.SupportsZExt)
return std::nullopt;
- if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))
+ if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) ||
+ !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget))
return std::nullopt;
return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()),
Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false);
@@ -13219,6 +13386,8 @@ SmallVector<NodeExtensionHelper::CombineToTry>
NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
SmallVector<CombineToTry> Strategies;
switch (Root->getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
case RISCVISD::ADD_VL:
case RISCVISD::SUB_VL:
// add|sub -> vwadd(u)|vwsub(u)
@@ -13226,6 +13395,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
// add|sub -> vwadd(u)_w|vwsub(u)_w
Strategies.push_back(canFoldToVW_W);
break;
+ case ISD::MUL:
case RISCVISD::MUL_VL:
// mul -> vwmul(u)
Strategies.push_back(canFoldToVWWithSameExtension);
@@ -13256,12 +13426,14 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
/// mul_vl -> vwmul(u) | vwmul_su
/// vwadd_w(u) -> vwadd(u)
/// vwsub_w(u) -> vwsub(u)
-static SDValue
-combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- assert(NodeExtensionHelper::isSupportedRoot(N) &&
- "Shouldn't have called this method");
+ if (!NodeExtensionHelper::isSupportedRoot(N, DAG))
+ return SDValue();
+
SmallVector<SDNode *> Worklist;
SmallSet<SDNode *, 8> Inserted;
Worklist.push_back(N);
@@ -13270,11 +13442,11 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
while (!Worklist.empty()) {
SDNode *Root = Worklist.pop_back_val();
- if (!NodeExtensionHelper::isSupportedRoot(Root))
+ if (!NodeExtensionHelper::isSupportedRoot(Root, DAG))
return SDValue();
- NodeExtensionHelper LHS(N, 0, DAG);
- NodeExtensionHelper RHS(N, 1, DAG);
+ NodeExtensionHelper LHS(N, 0, DAG, Subtarget);
+ NodeExtensionHelper RHS(N, 1, DAG, Subtarget);
auto AppendUsersIfNeeded = [&Worklist,
&Inserted](const NodeExtensionHelper &Op) {
if (Op.needToPromoteOtherUsers()) {
@@ -13301,7 +13473,8 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
for (NodeExtensionHelper::CombineToTry FoldingStrategy :
FoldingStrategies) {
- std::optional<CombineResult> Res = FoldingStrategy(N, LHS, RHS);
+ std::optional<CombineResult> Res =
+ FoldingStrategy(N, LHS, RHS, DAG, Subtarget);
if (Res) {
Matched = true;
CombinesToApply.push_back(*Res);
@@ -13330,7 +13503,7 @@ combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
SmallVector<std::pair<SDValue, SDValue>> ValuesToReplace;
ValuesToReplace.reserve(CombinesToApply.size());
for (CombineResult Res : CombinesToApply) {
- SDValue NewValue = Res.materialize(DAG);
+ SDValue NewValue = Res.materialize(DAG, Subtarget);
if (!InputRootReplacement) {
assert(Res.Root == N &&
"First element is expected to be the current node");
@@ -13493,6 +13666,7 @@ static SDValue performMemPairCombine(SDNode *N,
// (fp_to_int (ffloor X)) -> fcvt X, rdn
// (fp_to_int (fceil X)) -> fcvt X, rup
// (fp_to_int (fround X)) -> fcvt X, rmm
+// (fp_to_int (frint X)) -> fcvt X
static SDValue performFP_TO_INTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
@@ -13516,10 +13690,7 @@ static SDValue performFP_TO_INTCombine(SDNode *N,
RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Src.getOpcode());
// If the result is invalid, we didn't find a foldable instruction.
- // If the result is dynamic, then we found an frint which we don't yet
- // support. It will cause 7 to be written to the FRM CSR for vector.
- // FIXME: We could support this by using VFCVT_X_F_VL/VFCVT_XU_F_VL below.
- if (FRM == RISCVFPRndMode::Invalid || FRM == RISCVFPRndMode::DYN)
+ if (FRM == RISCVFPRndMode::Invalid)
return SDValue();
SDLoc DL(N);
@@ -13558,6 +13729,10 @@ static SDValue performFP_TO_INTCombine(SDNode *N,
unsigned Opc =
IsSigned ? RISCVISD::VFCVT_RTZ_X_F_VL : RISCVISD::VFCVT_RTZ_XU_F_VL;
FpToInt = DAG.getNode(Opc, DL, ContainerVT, XVal, Mask, VL);
+ } else if (FRM == RISCVFPRndMode::DYN) {
+ unsigned Opc =
+ IsSigned ? RISCVISD::VFCVT_X_F_VL : RISCVISD::VFCVT_XU_F_VL;
+ FpToInt = DAG.getNode(Opc, DL, ContainerVT, XVal, Mask, VL);
} else {
unsigned Opc =
IsSigned ? RISCVISD::VFCVT_RM_X_F_VL : RISCVISD::VFCVT_RM_XU_F_VL;
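A scalar analogue of the newly handled DYN case, as an illustrative sketch only: converting frint(x) to an integer collapses into a single conversion under the current dynamic rounding mode, which is what the vfcvt.x.f.v / vfcvt.xu.f.v forms provide per lane.

    #include <cmath>
    #include <cstdint>

    // Scalar sketch: fp_to_sint(rint(X)) equals one round-to-integer under the
    // current FP rounding mode (overflow aside).
    static int64_t fpToIntOfRint(double X) {
      return std::llrint(X);
    }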
@@ -13978,7 +14153,7 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
for (SDNode *U : N0->uses()) {
if (U->getOpcode() != ISD::SRA ||
!isa<ConstantSDNode>(U->getOperand(1)) ||
- cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() > 32)
+ U->getConstantOperandVal(1) > 32)
return SDValue();
}
@@ -14600,13 +14775,20 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- assert(N->getOpcode() == RISCVISD::ADD_VL);
+
+ assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD);
+
+ if (N->getValueType(0).isFixedLengthVector())
+ return SDValue();
+
SDValue Addend = N->getOperand(0);
SDValue MulOp = N->getOperand(1);
- SDValue AddMergeOp = N->getOperand(2);
- if (!AddMergeOp.isUndef())
- return SDValue();
+ if (N->getOpcode() == RISCVISD::ADD_VL) {
+ SDValue AddMergeOp = N->getOperand(2);
+ if (!AddMergeOp.isUndef())
+ return SDValue();
+ }
auto IsVWMulOpc = [](unsigned Opc) {
switch (Opc) {
@@ -14630,8 +14812,16 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
if (!MulMergeOp.isUndef())
return SDValue();
- SDValue AddMask = N->getOperand(3);
- SDValue AddVL = N->getOperand(4);
+ auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ if (N->getOpcode() == ISD::ADD) {
+ SDLoc DL(N);
+ return getDefaultScalableVLOps(N->getSimpleValueType(0), DL, DAG,
+ Subtarget);
+ }
+ return std::make_pair(N->getOperand(3), N->getOperand(4));
+ }(N, DAG, Subtarget);
+
SDValue MulMask = MulOp.getOperand(3);
SDValue MulVL = MulOp.getOperand(4);
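A lane-level sketch of the multiply-accumulate this combine forms (vwmaccLane is an illustrative helper, not backend code): the accumulator is updated with the product of the narrow sources computed at the widened width.

    #include <cstdint>

    // One lane of vwmacc.vv: Acc += sext(A) * sext(B), multiplied at i32 width.
    static int32_t vwmaccLane(int32_t Acc, int16_t A, int16_t B) {
      return Acc + static_cast<int32_t>(A) * static_cast<int32_t>(B);
    }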
@@ -14897,10 +15087,18 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getNode(ISD::AND, DL, VT, NewFMV,
DAG.getConstant(~SignBit, DL, VT));
}
- case ISD::ADD:
+ case ISD::ADD: {
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ return V;
+ if (SDValue V = combineToVWMACC(N, DAG, Subtarget))
+ return V;
return performADDCombine(N, DAG, Subtarget);
- case ISD::SUB:
+ }
+ case ISD::SUB: {
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ return V;
return performSUBCombine(N, DAG, Subtarget);
+ }
case ISD::AND:
return performANDCombine(N, DCI, Subtarget);
case ISD::OR:
@@ -14908,6 +15106,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::XOR:
return performXORCombine(N, DAG, Subtarget);
case ISD::MUL:
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ return V;
return performMULCombine(N, DAG);
case ISD::FADD:
case ISD::UMAX:
@@ -15384,7 +15584,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::ADD_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI))
+ if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
return V;
return combineToVWMACC(N, DAG, Subtarget);
case RISCVISD::SUB_VL:
@@ -15393,7 +15593,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case RISCVISD::VWSUB_W_VL:
case RISCVISD::VWSUBU_W_VL:
case RISCVISD::MUL_VL:
- return combineBinOp_VLToVWBinOp_VL(N, DCI);
+ return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
case RISCVISD::VFMADD_VL:
case RISCVISD::VFNMADD_VL:
case RISCVISD::VFMSUB_VL:
@@ -15999,13 +16199,26 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
// We can't do anything for most intrinsics.
break;
case Intrinsic::riscv_vsetvli:
- case Intrinsic::riscv_vsetvlimax:
- // Assume that VL output is <= 65536.
- // TODO: Take SEW and LMUL into account.
- if (BitWidth > 17)
- Known.Zero.setBitsFrom(17);
+ case Intrinsic::riscv_vsetvlimax: {
+ bool HasAVL = IntNo == Intrinsic::riscv_vsetvli;
+ unsigned VSEW = Op.getConstantOperandVal(HasAVL + 1);
+ RISCVII::VLMUL VLMUL =
+ static_cast<RISCVII::VLMUL>(Op.getConstantOperandVal(HasAVL + 2));
+ unsigned SEW = RISCVVType::decodeVSEW(VSEW);
+ auto [LMul, Fractional] = RISCVVType::decodeVLMUL(VLMUL);
+ uint64_t MaxVL = Subtarget.getRealMaxVLen() / SEW;
+ MaxVL = (Fractional) ? MaxVL / LMul : MaxVL * LMul;
+
+ // The result of vsetvli must not be larger than AVL.
+ if (HasAVL && isa<ConstantSDNode>(Op.getOperand(1)))
+ MaxVL = std::min(MaxVL, Op.getConstantOperandVal(1));
+
+ unsigned KnownZeroFirstBit = Log2_32(MaxVL) + 1;
+ if (BitWidth > KnownZeroFirstBit)
+ Known.Zero.setBitsFrom(KnownZeroFirstBit);
break;
}
+ }
break;
}
}
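Worked numbers for the bound computed above, with an assumed maximal VLEN of 128 (the value is illustrative, not taken from the patch): for SEW=32 and LMUL=2, MaxVL = 128 / 32 * 2 = 8, so Log2_32(8) + 1 = 4 and every bit of the vsetvli result from index 4 upward is known zero. A standalone sketch of the same arithmetic (firstKnownZeroBit is a hypothetical helper):

    #include <algorithm>
    #include <cstdint>

    // Restatement of the bound above; fractional LMULs are passed as their
    // divisor (e.g. mf2 -> LMul = 2, Fractional = true).
    static unsigned firstKnownZeroBit(uint64_t MaxVLen, unsigned SEW,
                                      unsigned LMul, bool Fractional,
                                      uint64_t ConstantAVL) {
      uint64_t MaxVL = MaxVLen / SEW;
      MaxVL = Fractional ? MaxVL / LMul : MaxVL * LMul;
      MaxVL = std::min(MaxVL, ConstantAVL);  // a constant AVL caps the result
      unsigned Log2 = 0;
      for (uint64_t V = MaxVL; V > 1; V >>= 1)
        ++Log2;                              // floor(log2(MaxVL)) for MaxVL >= 1
      return Log2 + 1;                       // bits >= this index are known zero
    }
    // firstKnownZeroBit(128, 32, 2, /*Fractional=*/false, /*AVL=*/100) == 4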
@@ -18571,7 +18784,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VNSRL_VL)
NODE_NAME_CASE(SETCC_VL)
NODE_NAME_CASE(VSELECT_VL)
- NODE_NAME_CASE(VP_MERGE_VL)
+ NODE_NAME_CASE(VMERGE_VL)
NODE_NAME_CASE(VMAND_VL)
NODE_NAME_CASE(VMOR_VL)
NODE_NAME_CASE(VMXOR_VL)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 41a2dc5771c8..58ed611efc83 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -332,10 +332,8 @@ enum NodeType : unsigned {
// Vector select with an additional VL operand. This operation is unmasked.
VSELECT_VL,
- // Vector select with operand #2 (the value when the condition is false) tied
- // to the destination and an additional VL operand. This operation is
- // unmasked.
- VP_MERGE_VL,
+ // General vmerge node with mask, true, false, passthru, and vl operands.
+ VMERGE_VL,
// Mask binary operators.
VMAND_VL,
@@ -910,6 +908,7 @@ private:
SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPExtMaskOp(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPSetCCMaskOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPFPIntConvOp(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVPStridedLoad(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
index de2227f82192..e487cc8b2e20 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertWriteVXRM.cpp
@@ -198,13 +198,23 @@ char RISCVInsertWriteVXRM::ID = 0;
INITIALIZE_PASS(RISCVInsertWriteVXRM, DEBUG_TYPE, RISCV_INSERT_WRITE_VXRM_NAME,
false, false)
+static bool ignoresVXRM(const MachineInstr &MI) {
+ switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+ default:
+ return false;
+ case RISCV::VNCLIP_WI:
+ case RISCV::VNCLIPU_WI:
+ return MI.getOperand(3).getImm() == 0;
+ }
+}
+
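A scalar sketch of why a zero shift amount lets vnclip(u).wi ignore VXRM (a deliberate simplification, not the pass itself): the rounding increment is derived from the bits shifted out, and a shift of zero discards none.

    #include <cstdint>

    // Simplified single-lane model; RoundInc stands for whatever increment the
    // active VXRM mode would derive from the discarded bits.
    static uint8_t modelVNClipU(uint16_t Wide, unsigned Shift, uint32_t RoundInc) {
      uint32_t Shifted = static_cast<uint32_t>(Wide) >> Shift;
      if (Shift != 0)
        Shifted += RoundInc;       // rounding only matters when bits are lost
      return Shifted > 0xFF ? 0xFF : static_cast<uint8_t>(Shifted);  // saturate
    }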
bool RISCVInsertWriteVXRM::computeVXRMChanges(const MachineBasicBlock &MBB) {
BlockData &BBInfo = BlockInfo[MBB.getNumber()];
bool NeedVXRMWrite = false;
for (const MachineInstr &MI : MBB) {
int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc());
- if (VXRMIdx >= 0) {
+ if (VXRMIdx >= 0 && !ignoresVXRM(MI)) {
unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm();
if (!BBInfo.VXRMUse.isValid())
@@ -356,7 +366,7 @@ void RISCVInsertWriteVXRM::emitWriteVXRM(MachineBasicBlock &MBB) {
for (MachineInstr &MI : MBB) {
int VXRMIdx = RISCVII::getVXRMOpNum(MI.getDesc());
- if (VXRMIdx >= 0) {
+ if (VXRMIdx >= 0 && !ignoresVXRM(MI)) {
unsigned NewVXRMImm = MI.getOperand(VXRMIdx).getImm();
if (PendingInsert || !Info.isStatic() ||
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1dcff7eb563e..cd98438eed88 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2282,9 +2282,14 @@ bool RISCVInstrInfo::shouldClusterMemOps(
return false;
}
- // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
- // indicate they likely share a cache line.
- return ClusterSize <= 4;
+ unsigned CacheLineSize =
+ BaseOps1.front()->getParent()->getMF()->getSubtarget().getCacheLineSize();
+ // Assume a cache line size of 64 bytes if no size is set in RISCVSubtarget.
+ CacheLineSize = CacheLineSize ? CacheLineSize : 64;
+ // Cluster if the memory operations are on the same or a neighbouring cache
+ // line, but limit the maximum ClusterSize to avoid creating too much
+ // additional register pressure.
+ return ClusterSize <= 4 && std::abs(Offset1 - Offset2) < CacheLineSize;
}
// Set BaseReg (the base register operand), Offset (the byte offset being
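The new heuristic clusters two accesses only when the cluster stays small and the two offsets are within one cache line of each other, with 64 bytes assumed when the subtarget leaves the cache-line size unset. A self-contained sketch of that predicate (shouldClusterSketch is an illustrative name, not the TargetInstrInfo hook):

    #include <cstdint>

    // Restatement of the return expression above.
    static bool shouldClusterSketch(unsigned ClusterSize, int64_t Offset1,
                                    int64_t Offset2, unsigned CacheLineSize) {
      if (CacheLineSize == 0)
        CacheLineSize = 64;                    // fallback cache-line size
      int64_t Dist = Offset1 - Offset2;
      if (Dist < 0)
        Dist = -Dist;
      return ClusterSize <= 4 && Dist < static_cast<int64_t>(CacheLineSize);
    }
    // shouldClusterSketch(2, 0, 48, 0) -> true
    // shouldClusterSketch(2, 0, 96, 0) -> false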
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index edc08187d8f7..35e8edf5d2fa 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -2111,13 +2111,16 @@ include "RISCVInstrInfoZk.td"
include "RISCVInstrInfoV.td"
include "RISCVInstrInfoZvk.td"
-// Integer
-include "RISCVInstrInfoZicbo.td"
-include "RISCVInstrInfoZicond.td"
-
// Compressed
include "RISCVInstrInfoC.td"
include "RISCVInstrInfoZc.td"
+include "RISCVInstrInfoZcmop.td"
+
+// Integer
+include "RISCVInstrInfoZimop.td"
+include "RISCVInstrInfoZicbo.td"
+include "RISCVInstrInfoZicond.td"
+include "RISCVInstrInfoZicfiss.td"
//===----------------------------------------------------------------------===//
// Vendor extensions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 488ffa73f4e4..30deeaa06448 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -257,13 +257,13 @@ class SegRegClass<LMULInfo m, int nf> {
// Vector register and vector group type information.
//===----------------------------------------------------------------------===//
-class VTypeInfo<ValueType Vec, ValueType Mas, int Sew, VReg Reg, LMULInfo M,
+class VTypeInfo<ValueType Vec, ValueType Mas, int Sew, LMULInfo M,
ValueType Scal = XLenVT, RegisterClass ScalarReg = GPR> {
ValueType Vector = Vec;
ValueType Mask = Mas;
int SEW = Sew;
int Log2SEW = !logtwo(Sew);
- VReg RegClass = Reg;
+ VReg RegClass = M.vrclass;
LMULInfo LMul = M;
ValueType Scalar = Scal;
RegisterClass ScalarRegClass = ScalarReg;
@@ -279,9 +279,9 @@ class VTypeInfo<ValueType Vec, ValueType Mas, int Sew, VReg Reg, LMULInfo M,
}
class GroupVTypeInfo<ValueType Vec, ValueType VecM1, ValueType Mas, int Sew,
- VReg Reg, LMULInfo M, ValueType Scal = XLenVT,
+ LMULInfo M, ValueType Scal = XLenVT,
RegisterClass ScalarReg = GPR>
- : VTypeInfo<Vec, Mas, Sew, Reg, M, Scal, ScalarReg> {
+ : VTypeInfo<Vec, Mas, Sew, M, Scal, ScalarReg> {
ValueType VectorM1 = VecM1;
}
@@ -289,70 +289,70 @@ defset list<VTypeInfo> AllVectors = {
defset list<VTypeInfo> AllIntegerVectors = {
defset list<VTypeInfo> NoGroupIntegerVectors = {
defset list<VTypeInfo> FractionalGroupIntegerVectors = {
- def VI8MF8: VTypeInfo<vint8mf8_t, vbool64_t, 8, VR, V_MF8>;
- def VI8MF4: VTypeInfo<vint8mf4_t, vbool32_t, 8, VR, V_MF4>;
- def VI8MF2: VTypeInfo<vint8mf2_t, vbool16_t, 8, VR, V_MF2>;
- def VI16MF4: VTypeInfo<vint16mf4_t, vbool64_t, 16, VR, V_MF4>;
- def VI16MF2: VTypeInfo<vint16mf2_t, vbool32_t, 16, VR, V_MF2>;
- def VI32MF2: VTypeInfo<vint32mf2_t, vbool64_t, 32, VR, V_MF2>;
+ def VI8MF8: VTypeInfo<vint8mf8_t, vbool64_t, 8, V_MF8>;
+ def VI8MF4: VTypeInfo<vint8mf4_t, vbool32_t, 8, V_MF4>;
+ def VI8MF2: VTypeInfo<vint8mf2_t, vbool16_t, 8, V_MF2>;
+ def VI16MF4: VTypeInfo<vint16mf4_t, vbool64_t, 16, V_MF4>;
+ def VI16MF2: VTypeInfo<vint16mf2_t, vbool32_t, 16, V_MF2>;
+ def VI32MF2: VTypeInfo<vint32mf2_t, vbool64_t, 32, V_MF2>;
}
- def VI8M1: VTypeInfo<vint8m1_t, vbool8_t, 8, VR, V_M1>;
- def VI16M1: VTypeInfo<vint16m1_t, vbool16_t, 16, VR, V_M1>;
- def VI32M1: VTypeInfo<vint32m1_t, vbool32_t, 32, VR, V_M1>;
- def VI64M1: VTypeInfo<vint64m1_t, vbool64_t, 64, VR, V_M1>;
+ def VI8M1: VTypeInfo<vint8m1_t, vbool8_t, 8, V_M1>;
+ def VI16M1: VTypeInfo<vint16m1_t, vbool16_t, 16, V_M1>;
+ def VI32M1: VTypeInfo<vint32m1_t, vbool32_t, 32, V_M1>;
+ def VI64M1: VTypeInfo<vint64m1_t, vbool64_t, 64, V_M1>;
}
defset list<GroupVTypeInfo> GroupIntegerVectors = {
- def VI8M2: GroupVTypeInfo<vint8m2_t, vint8m1_t, vbool4_t, 8, VRM2, V_M2>;
- def VI8M4: GroupVTypeInfo<vint8m4_t, vint8m1_t, vbool2_t, 8, VRM4, V_M4>;
- def VI8M8: GroupVTypeInfo<vint8m8_t, vint8m1_t, vbool1_t, 8, VRM8, V_M8>;
+ def VI8M2: GroupVTypeInfo<vint8m2_t, vint8m1_t, vbool4_t, 8, V_M2>;
+ def VI8M4: GroupVTypeInfo<vint8m4_t, vint8m1_t, vbool2_t, 8, V_M4>;
+ def VI8M8: GroupVTypeInfo<vint8m8_t, vint8m1_t, vbool1_t, 8, V_M8>;
- def VI16M2: GroupVTypeInfo<vint16m2_t,vint16m1_t,vbool8_t, 16,VRM2, V_M2>;
- def VI16M4: GroupVTypeInfo<vint16m4_t,vint16m1_t,vbool4_t, 16,VRM4, V_M4>;
- def VI16M8: GroupVTypeInfo<vint16m8_t,vint16m1_t,vbool2_t, 16,VRM8, V_M8>;
+ def VI16M2: GroupVTypeInfo<vint16m2_t, vint16m1_t, vbool8_t, 16, V_M2>;
+ def VI16M4: GroupVTypeInfo<vint16m4_t, vint16m1_t, vbool4_t, 16, V_M4>;
+ def VI16M8: GroupVTypeInfo<vint16m8_t, vint16m1_t, vbool2_t, 16, V_M8>;
- def VI32M2: GroupVTypeInfo<vint32m2_t,vint32m1_t,vbool16_t,32,VRM2, V_M2>;
- def VI32M4: GroupVTypeInfo<vint32m4_t,vint32m1_t,vbool8_t, 32,VRM4, V_M4>;
- def VI32M8: GroupVTypeInfo<vint32m8_t,vint32m1_t,vbool4_t, 32,VRM8, V_M8>;
+ def VI32M2: GroupVTypeInfo<vint32m2_t, vint32m1_t, vbool16_t, 32, V_M2>;
+ def VI32M4: GroupVTypeInfo<vint32m4_t, vint32m1_t, vbool8_t, 32, V_M4>;
+ def VI32M8: GroupVTypeInfo<vint32m8_t, vint32m1_t, vbool4_t, 32, V_M8>;
- def VI64M2: GroupVTypeInfo<vint64m2_t,vint64m1_t,vbool32_t,64,VRM2, V_M2>;
- def VI64M4: GroupVTypeInfo<vint64m4_t,vint64m1_t,vbool16_t,64,VRM4, V_M4>;
- def VI64M8: GroupVTypeInfo<vint64m8_t,vint64m1_t,vbool8_t, 64,VRM8, V_M8>;
+ def VI64M2: GroupVTypeInfo<vint64m2_t, vint64m1_t, vbool32_t, 64, V_M2>;
+ def VI64M4: GroupVTypeInfo<vint64m4_t, vint64m1_t, vbool16_t, 64, V_M4>;
+ def VI64M8: GroupVTypeInfo<vint64m8_t, vint64m1_t, vbool8_t, 64, V_M8>;
}
}
defset list<VTypeInfo> AllFloatVectors = {
defset list<VTypeInfo> NoGroupFloatVectors = {
defset list<VTypeInfo> FractionalGroupFloatVectors = {
- def VF16MF4: VTypeInfo<vfloat16mf4_t, vbool64_t, 16, VR, V_MF4, f16, FPR16>;
- def VF16MF2: VTypeInfo<vfloat16mf2_t, vbool32_t, 16, VR, V_MF2, f16, FPR16>;
- def VF32MF2: VTypeInfo<vfloat32mf2_t,vbool64_t, 32, VR, V_MF2, f32, FPR32>;
+ def VF16MF4: VTypeInfo<vfloat16mf4_t, vbool64_t, 16, V_MF4, f16, FPR16>;
+ def VF16MF2: VTypeInfo<vfloat16mf2_t, vbool32_t, 16, V_MF2, f16, FPR16>;
+ def VF32MF2: VTypeInfo<vfloat32mf2_t, vbool64_t, 32, V_MF2, f32, FPR32>;
}
- def VF16M1: VTypeInfo<vfloat16m1_t, vbool16_t, 16, VR, V_M1, f16, FPR16>;
- def VF32M1: VTypeInfo<vfloat32m1_t, vbool32_t, 32, VR, V_M1, f32, FPR32>;
- def VF64M1: VTypeInfo<vfloat64m1_t, vbool64_t, 64, VR, V_M1, f64, FPR64>;
+ def VF16M1: VTypeInfo<vfloat16m1_t, vbool16_t, 16, V_M1, f16, FPR16>;
+ def VF32M1: VTypeInfo<vfloat32m1_t, vbool32_t, 32, V_M1, f32, FPR32>;
+ def VF64M1: VTypeInfo<vfloat64m1_t, vbool64_t, 64, V_M1, f64, FPR64>;
}
defset list<GroupVTypeInfo> GroupFloatVectors = {
def VF16M2: GroupVTypeInfo<vfloat16m2_t, vfloat16m1_t, vbool8_t, 16,
- VRM2, V_M2, f16, FPR16>;
+ V_M2, f16, FPR16>;
def VF16M4: GroupVTypeInfo<vfloat16m4_t, vfloat16m1_t, vbool4_t, 16,
- VRM4, V_M4, f16, FPR16>;
+ V_M4, f16, FPR16>;
def VF16M8: GroupVTypeInfo<vfloat16m8_t, vfloat16m1_t, vbool2_t, 16,
- VRM8, V_M8, f16, FPR16>;
+ V_M8, f16, FPR16>;
def VF32M2: GroupVTypeInfo<vfloat32m2_t, vfloat32m1_t, vbool16_t, 32,
- VRM2, V_M2, f32, FPR32>;
+ V_M2, f32, FPR32>;
def VF32M4: GroupVTypeInfo<vfloat32m4_t, vfloat32m1_t, vbool8_t, 32,
- VRM4, V_M4, f32, FPR32>;
+ V_M4, f32, FPR32>;
def VF32M8: GroupVTypeInfo<vfloat32m8_t, vfloat32m1_t, vbool4_t, 32,
- VRM8, V_M8, f32, FPR32>;
+ V_M8, f32, FPR32>;
def VF64M2: GroupVTypeInfo<vfloat64m2_t, vfloat64m1_t, vbool32_t, 64,
- VRM2, V_M2, f64, FPR64>;
+ V_M2, f64, FPR64>;
def VF64M4: GroupVTypeInfo<vfloat64m4_t, vfloat64m1_t, vbool16_t, 64,
- VRM4, V_M4, f64, FPR64>;
+ V_M4, f64, FPR64>;
def VF64M8: GroupVTypeInfo<vfloat64m8_t, vfloat64m1_t, vbool8_t, 64,
- VRM8, V_M8, f64, FPR64>;
+ V_M8, f64, FPR64>;
}
}
}
@@ -360,19 +360,19 @@ defset list<VTypeInfo> AllVectors = {
defset list<VTypeInfo> AllBFloatVectors = {
defset list<VTypeInfo> NoGroupBFloatVectors = {
defset list<VTypeInfo> FractionalGroupBFloatVectors = {
- def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, VR, V_MF4, bf16, FPR16>;
- def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, VR, V_MF2, bf16, FPR16>;
+ def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>;
+ def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>;
}
- def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, VR, V_M1, bf16, FPR16>;
+ def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>;
}
defset list<GroupVTypeInfo> GroupBFloatVectors = {
def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16,
- VRM2, V_M2, bf16, FPR16>;
+ V_M2, bf16, FPR16>;
def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16,
- VRM4, V_M4, bf16, FPR16>;
+ V_M4, bf16, FPR16>;
def VBF16M8: GroupVTypeInfo<vbfloat16m8_t, vbfloat16m1_t, vbool2_t, 16,
- VRM8, V_M8, bf16, FPR16>;
+ V_M8, bf16, FPR16>;
}
}
@@ -1069,7 +1069,8 @@ class VPseudoUnaryMask<VReg RetClass,
class VPseudoUnaryMaskRoundingMode<VReg RetClass,
VReg OpClass,
- string Constraint = ""> :
+ string Constraint = "",
+ int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
VMaskOp:$vm, ixlenimm:$rm,
@@ -1079,6 +1080,7 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass,
let mayStore = 0;
let hasSideEffects = 0;
let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1106,7 +1108,8 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass,
class VPseudoUnaryNoMask_FRM<VReg RetClass,
VReg OpClass,
- string Constraint = ""> :
+ string Constraint = "",
+ int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
(ins RetClass:$merge, OpClass:$rs2, ixlenimm:$frm,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
@@ -1115,6 +1118,7 @@ class VPseudoUnaryNoMask_FRM<VReg RetClass,
let mayStore = 0;
let hasSideEffects = 0;
let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1123,7 +1127,8 @@ class VPseudoUnaryNoMask_FRM<VReg RetClass,
class VPseudoUnaryMask_FRM<VReg RetClass,
VReg OpClass,
- string Constraint = ""> :
+ string Constraint = "",
+ int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
VMaskOp:$vm, ixlenimm:$frm,
@@ -1133,6 +1138,7 @@ class VPseudoUnaryMask_FRM<VReg RetClass,
let mayStore = 0;
let hasSideEffects = 0;
let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1528,7 +1534,8 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
DAGOperand Op2Class,
LMULInfo MInfo,
bit CarryIn,
- string Constraint> :
+ string Constraint,
+ int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
!if(CarryIn,
(ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1,
@@ -1540,6 +1547,7 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
let mayStore = 0;
let hasSideEffects = 0;
let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 0;
@@ -2447,10 +2455,11 @@ multiclass VPseudoBinaryV_VM<LMULInfo m, bit CarryOut = 0, bit CarryIn = 1,
m.vrclass, m.vrclass, m, CarryIn, Constraint, TargetConstraintType>;
}
-multiclass VPseudoTiedBinaryV_VM<LMULInfo m> {
+multiclass VPseudoTiedBinaryV_VM<LMULInfo m, int TargetConstraintType = 1> {
def "_VVM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, m.vrclass, m, 1, "">;
+ m.vrclass, m.vrclass, m, 1, "",
+ TargetConstraintType>;
}
multiclass VPseudoBinaryV_XM<LMULInfo m, bit CarryOut = 0, bit CarryIn = 1,
@@ -2462,10 +2471,11 @@ multiclass VPseudoBinaryV_XM<LMULInfo m, bit CarryOut = 0, bit CarryIn = 1,
m.vrclass, GPR, m, CarryIn, Constraint, TargetConstraintType>;
}
-multiclass VPseudoTiedBinaryV_XM<LMULInfo m> {
+multiclass VPseudoTiedBinaryV_XM<LMULInfo m, int TargetConstraintType = 1> {
def "_VXM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
- m.vrclass, GPR, m, 1, "">;
+ m.vrclass, GPR, m, 1, "",
+ TargetConstraintType>;
}
multiclass VPseudoVMRG_FM {
@@ -2596,45 +2606,48 @@ multiclass VPseudoVRCP_V_RM {
}
}
-multiclass PseudoVEXT_VF2<int TargetConstraintType = 1> {
+multiclass PseudoVEXT_VF2 {
defvar constraints = "@earlyclobber $rd";
foreach m = MxListVF2 in {
defvar mx = m.MX;
+ defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF4"), !eq(mx, "MF2"), !eq(mx, "M1")), 1, 3);
let VLMul = m.value in {
- def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints, TargetConstraintType>,
+ def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints, CurrTypeConstraints>,
SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
def "_" # mx # "_MASK" :
- VPseudoUnaryMask<m.vrclass, m.f2vrclass, constraints, TargetConstraintType>,
+ VPseudoUnaryMask<m.vrclass, m.f2vrclass, constraints, CurrTypeConstraints>,
RISCVMaskedPseudo<MaskIdx=2>,
SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
}
}
}
-multiclass PseudoVEXT_VF4<int TargetConstraintType = 1> {
+multiclass PseudoVEXT_VF4 {
defvar constraints = "@earlyclobber $rd";
foreach m = MxListVF4 in {
defvar mx = m.MX;
+ defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF2"), !eq(mx, "M1"), !eq(mx, "M2")), 1, 3);
let VLMul = m.value in {
- def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints, TargetConstraintType>,
+ def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints, CurrTypeConstraints>,
SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
def "_" # mx # "_MASK" :
- VPseudoUnaryMask<m.vrclass, m.f4vrclass, constraints, TargetConstraintType>,
+ VPseudoUnaryMask<m.vrclass, m.f4vrclass, constraints, CurrTypeConstraints>,
RISCVMaskedPseudo<MaskIdx=2>,
SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
}
}
}
-multiclass PseudoVEXT_VF8<int TargetConstraintType = 1> {
+multiclass PseudoVEXT_VF8 {
defvar constraints = "@earlyclobber $rd";
foreach m = MxListVF8 in {
defvar mx = m.MX;
+ defvar CurrTypeConstraints = !if(!or(!eq(mx, "M1"), !eq(mx, "M2"), !eq(mx, "M4")), 1, 3);
let VLMul = m.value in {
- def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints, TargetConstraintType>,
+ def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints, CurrTypeConstraints>,
SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
def "_" # mx # "_MASK" :
- VPseudoUnaryMask<m.vrclass, m.f8vrclass, constraints, TargetConstraintType>,
+ VPseudoUnaryMask<m.vrclass, m.f8vrclass, constraints, CurrTypeConstraints>,
RISCVMaskedPseudo<MaskIdx=2>,
SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
}
@@ -3619,7 +3632,7 @@ multiclass VPseudoConversionRoundingMode<VReg RetClass,
let VLMul = MInfo.value in {
def "_" # MInfo.MX : VPseudoUnaryNoMaskRoundingMode<RetClass, Op1Class, Constraint, TargetConstraintType>;
def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMaskRoundingMode<RetClass, Op1Class,
- Constraint>,
+ Constraint, TargetConstraintType>,
RISCVMaskedPseudo<MaskIdx=2>;
}
}
@@ -3628,12 +3641,13 @@ multiclass VPseudoConversionRoundingMode<VReg RetClass,
multiclass VPseudoConversionRM<VReg RetClass,
VReg Op1Class,
LMULInfo MInfo,
- string Constraint = ""> {
+ string Constraint = "",
+ int TargetConstraintType = 1> {
let VLMul = MInfo.value in {
def "_" # MInfo.MX : VPseudoUnaryNoMask_FRM<RetClass, Op1Class,
- Constraint>;
+ Constraint, TargetConstraintType>;
def "_" # MInfo.MX # "_MASK" : VPseudoUnaryMask_FRM<RetClass, Op1Class,
- Constraint>,
+ Constraint, TargetConstraintType>,
RISCVMaskedPseudo<MaskIdx=2>;
}
}
@@ -3761,7 +3775,7 @@ multiclass VPseudoVNCVTI_W_RM {
multiclass VPseudoVNCVTI_RM_W {
defvar constraint = "@earlyclobber $rd";
foreach m = MxListW in {
- defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint>,
+ defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint, TargetConstraintType=2>,
SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX,
forceMergeOpRead=true>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index dc6b57fad321..5b50a4a78c01 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -344,7 +344,14 @@ def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [
]>;
def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>;
-def riscv_vp_merge_vl : SDNode<"RISCVISD::VP_MERGE_VL", SDT_RISCVSelect_VL>;
+
+def SDT_RISCVVMERGE_VL : SDTypeProfile<1, 5, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>,
+ SDTCisVT<5, XLenVT>
+]>;
+
+def riscv_vmerge_vl : SDNode<"RISCVISD::VMERGE_VL", SDT_RISCVVMERGE_VL>;
def SDT_RISCVVMSETCLR_VL : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i1>,
SDTCisVT<1, XLenVT>]>;
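A per-lane model of the five-operand riscv_vmerge_vl node defined above, written as a sketch under a tail-undisturbed reading (modelVMergeVL is an illustrative name, not backend code): lanes below VL choose between the true and false operands by the mask bit, and the tail keeps the passthru lanes.

    #include <cstddef>
    #include <vector>

    // Illustrative model of VMERGE_VL(mask, true, false, passthru, vl).
    static std::vector<int> modelVMergeVL(const std::vector<bool> &Mask,
                                          const std::vector<int> &TrueV,
                                          const std::vector<int> &FalseV,
                                          const std::vector<int> &Passthru,
                                          std::size_t VL) {
      std::vector<int> Res = Passthru;
      for (std::size_t I = 0; I < VL && I < Res.size(); ++I)
        Res[I] = Mask[I] ? TrueV[I] : FalseV[I];
      return Res;
    }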
@@ -675,14 +682,14 @@ multiclass VPatTiedBinaryNoMaskVL_V<SDNode vop,
op2_reg_class:$rs2,
GPR:$vl, sew, TAIL_AGNOSTIC)>;
// Tail undisturbed
- def : Pat<(riscv_vp_merge_vl true_mask,
+ def : Pat<(riscv_vmerge_vl true_mask,
(result_type (vop
result_reg_class:$rs1,
(op2_type op2_reg_class:$rs2),
srcvalue,
true_mask,
VLOpFrag)),
- result_reg_class:$rs1, VLOpFrag),
+ result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag),
(!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_TIED")
result_reg_class:$rs1,
op2_reg_class:$rs2,
@@ -712,14 +719,14 @@ multiclass VPatTiedBinaryNoMaskVL_V_RM<SDNode vop,
FRM_DYN,
GPR:$vl, sew, TAIL_AGNOSTIC)>;
// Tail undisturbed
- def : Pat<(riscv_vp_merge_vl true_mask,
+ def : Pat<(riscv_vmerge_vl true_mask,
(result_type (vop
result_reg_class:$rs1,
(op2_type op2_reg_class:$rs2),
srcvalue,
true_mask,
VLOpFrag)),
- result_reg_class:$rs1, VLOpFrag),
+ result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag),
(!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_TIED")
result_reg_class:$rs1,
op2_reg_class:$rs2,
@@ -1697,21 +1704,21 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {
foreach vti = AllIntegerVectors in {
defvar suffix = vti.LMul.MX;
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(riscv_vp_merge_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (op vti.RegClass:$rd,
(riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,
srcvalue, (vti.Mask true_mask), VLOpFrag),
srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vp_merge_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (op vti.RegClass:$rd,
(riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,
srcvalue, (vti.Mask true_mask), VLOpFrag),
srcvalue, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
@@ -1840,17 +1847,17 @@ multiclass VPatFPMulAccVL_VV_VF<PatFrag vop, string instruction_name> {
foreach vti = AllFloatVectors in {
defvar suffix = vti.LMul.MX;
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(riscv_vp_merge_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vp_merge_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
(!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>;
@@ -1876,10 +1883,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {
foreach vti = AllFloatVectors in {
defvar suffix = vti.LMul.MX;
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(riscv_vp_merge_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
(!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")
vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0),
@@ -1887,10 +1894,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {
// RISCVInsertReadWriteCSR
FRM_DYN,
GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(riscv_vp_merge_vl (vti.Mask V0),
+ def : Pat<(riscv_vmerge_vl (vti.Mask V0),
(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,
vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)),
- vti.RegClass:$rd, VLOpFrag),
+ vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag),
(!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")
vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0),
@@ -2273,29 +2280,32 @@ foreach vti = AllIntegerVectors in {
(vti.Vector (IMPLICIT_DEF)),
vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0),
- vti.RegClass:$rs1,
- vti.RegClass:$rs2,
- VLOpFrag)),
+ def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
+ vti.RegClass:$rs1,
+ vti.RegClass:$rs2,
+ vti.RegClass:$merge,
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
- vti.RegClass:$rs2, vti.RegClass:$rs2, vti.RegClass:$rs1,
- (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+ vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0),
- (SplatPat XLenVT:$rs1),
- vti.RegClass:$rs2,
- VLOpFrag)),
+ def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
+ (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rs2,
+ vti.RegClass:$merge,
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
- vti.RegClass:$rs2, vti.RegClass:$rs2, GPR:$rs1,
- (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
-
- def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0),
- (SplatPat_simm5 simm5:$rs1),
- vti.RegClass:$rs2,
- VLOpFrag)),
+ vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
+ (SplatPat_simm5 simm5:$rs1),
+ vti.RegClass:$rs2,
+ vti.RegClass:$merge,
+ VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
- vti.RegClass:$rs2, vti.RegClass:$rs2, simm5:$rs1,
- (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+ vti.RegClass:$merge, vti.RegClass:$rs2, simm5:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
}
@@ -2328,6 +2338,64 @@ defm : VPatBinaryVL_VV_VX_VI<riscv_uaddsat_vl, "PseudoVSADDU">;
defm : VPatBinaryVL_VV_VX<riscv_ssubsat_vl, "PseudoVSSUB">;
defm : VPatBinaryVL_VV_VX<riscv_usubsat_vl, "PseudoVSSUBU">;
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+class VPatTruncSatClipMaxMinBase<string inst,
+ VTypeInfo vti,
+ VTypeInfo wti,
+ SDPatternOperator op1,
+ int op1_value,
+ SDPatternOperator op2,
+ int op2_value> :
+ Pat<(vti.Vector (riscv_trunc_vector_vl
+ (wti.Vector (op1
+ (wti.Vector (op2
+ (wti.Vector wti.RegClass:$rs1),
+ (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), op2_value, (XLenVT srcvalue))),
+ (wti.Vector undef),(wti.Mask V0), VLOpFrag)),
+ (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), op1_value, (XLenVT srcvalue))),
+ (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
+ (vti.Mask V0), VLOpFrag)),
+ (!cast<Instruction>(inst#"_WI_"#vti.LMul.MX#"_MASK")
+ (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+ (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+
+class VPatTruncSatClipUMin<VTypeInfo vti,
+ VTypeInfo wti,
+ int uminval> :
+ Pat<(vti.Vector (riscv_trunc_vector_vl
+ (wti.Vector (riscv_umin_vl
+ (wti.Vector wti.RegClass:$rs1),
+ (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), uminval, (XLenVT srcvalue))),
+ (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
+ (vti.Mask V0), VLOpFrag)),
+ (!cast<Instruction>("PseudoVNCLIPU_WI_"#vti.LMul.MX#"_MASK")
+ (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+ (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+
+multiclass VPatTruncSatClipMaxMin<string inst, VTypeInfo vti, VTypeInfo wti,
+ SDPatternOperator max, int maxval, SDPatternOperator min, int minval> {
+ def : VPatTruncSatClipMaxMinBase<inst, vti, wti, max, maxval, min, minval>;
+ def : VPatTruncSatClipMaxMinBase<inst, vti, wti, min, minval, max, maxval>;
+}
+
+multiclass VPatTruncSatClip<VTypeInfo vti, VTypeInfo wti> {
+ defvar sew = vti.SEW;
+ defvar uminval = !sub(!shl(1, sew), 1);
+ defvar sminval = !sub(!shl(1, !sub(sew, 1)), 1);
+ defvar smaxval = !sub(0, !shl(1, !sub(sew, 1)));
+
+ let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+ GetVTypePredicates<wti>.Predicates) in {
+ defm : VPatTruncSatClipMaxMin<"PseudoVNCLIP", vti, wti, riscv_smin_vl,
+ sminval, riscv_smax_vl, smaxval>;
+ def : VPatTruncSatClipUMin<vti, wti, uminval>;
+ }
+
+}
+
+foreach vtiToWti = AllWidenableIntVectors in
+ defm : VPatTruncSatClip<vtiToWti.Vti, vtiToWti.Wti>;
+
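The clamp constants derived above from the narrow element width are simply the bounds of the destination integer type, so the clip patterns recognize a clamp-then-truncate as a single vnclip(u). A short sketch of the arithmetic (printClipBounds is an illustrative name):

    #include <cstdint>
    #include <iostream>

    // uminval / sminval / smaxval as computed by the !shl / !sub lines above.
    static void printClipBounds(unsigned SEW) {
      int64_t UMinVal = (int64_t(1) << SEW) - 1;        // clamp fed to umin
      int64_t SMinVal = (int64_t(1) << (SEW - 1)) - 1;  // upper clamp fed to smin
      int64_t SMaxVal = -(int64_t(1) << (SEW - 1));     // lower clamp fed to smax
      std::cout << "SEW=" << SEW << " umin=" << UMinVal << " smin=" << SMinVal
                << " smax=" << SMaxVal << '\n';
    }
    // SEW=8  -> umin=255    smin=127    smax=-128
    // SEW=16 -> umin=65535  smin=32767  smax=-32768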
// 13. Vector Floating-Point Instructions
// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
@@ -2493,21 +2561,23 @@ foreach fvti = AllFloatVectors in {
(fvti.Vector (IMPLICIT_DEF)),
fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0),
- fvti.RegClass:$rs1,
- fvti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
- GPR:$vl, fvti.Log2SEW)>;
-
- def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0),
- (SplatFPOp (fvti.Scalar fpimm0)),
- fvti.RegClass:$rs2,
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
- GPR:$vl, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+ fvti.RegClass:$rs1,
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
+ fvti.RegClass:$merge, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
+ GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+ (SplatFPOp (fvti.Scalar fpimm0)),
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$merge,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
+ fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
+ GPR:$vl, fvti.Log2SEW)>;
}
let Predicates = GetVTypePredicates<fvti>.Predicates in {
@@ -2521,12 +2591,13 @@ foreach fvti = AllFloatVectors in {
(fvti.Scalar fvti.ScalarRegClass:$rs1),
(fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
- def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0),
- (SplatFPOp fvti.ScalarRegClass:$rs1),
- fvti.RegClass:$rs2,
- VLOpFrag)),
+ def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
+ (SplatFPOp fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rs2,
+ fvti.RegClass:$merge,
+ VLOpFrag)),
(!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
- fvti.RegClass:$rs2, fvti.RegClass:$rs2,
+ fvti.RegClass:$merge, fvti.RegClass:$rs2,
(fvti.Scalar fvti.ScalarRegClass:$rs1),
(fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index a16fa7e76992..31f832dfd84c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -349,24 +349,30 @@ multiclass VPseudoSiFiveVMACC<string mx, VReg vd_type, VReg vs2_type,
: VPseudoTernaryNoMaskWithPolicy<vd_type, V_M1.vrclass, vs2_type, Constraint>;
}
-multiclass VPseudoSiFiveVQMACC<string Constraint = ""> {
+multiclass VPseudoSiFiveVQMACCDOD<string Constraint = ""> {
foreach m = MxListVF8 in
let VLMul = m.value in
defm NAME : VPseudoSiFiveVMACC<m.MX, m.vrclass, m.vrclass, Constraint>;
}
+multiclass VPseudoSiFiveVQMACCQOQ<string Constraint = ""> {
+ foreach m = [V_MF2, V_M1, V_M2, V_M4] in
+ let VLMul = m.value in
+ defm NAME : VPseudoSiFiveVMACC<m.MX, m.wvrclass, m.vrclass, Constraint>;
+}
+
multiclass VPseudoSiFiveVFWMACC<string Constraint = ""> {
- foreach m = MxListFW in
+ foreach m = MxListVF2 in
let VLMul = m.value in
defm NAME : VPseudoSiFiveVMACC<m.MX, m.wvrclass, m.vrclass, Constraint>;
}
multiclass VPseudoSiFiveVFNRCLIP<string Constraint = "@earlyclobber $rd"> {
- foreach m = MxListVF4 in
+ foreach i = 0-4 in
let hasSideEffects = 0 in
- defm "Pseudo" # NAME : VPseudoBinaryRoundingMode<!if(!eq(m.vrclass, VRM8),
- VRM2, VR),
- m.vrclass, FPR32, m,
+ defm "Pseudo" # NAME : VPseudoBinaryRoundingMode<MxListW[i].vrclass,
+ MxListVF4[i].vrclass,
+ FPR32, MxListW[i],
Constraint, /*sew*/0,
UsesVXRM=0>;
}
@@ -400,17 +406,17 @@ let Predicates = [HasVendorXSfvcp] in {
}
let Predicates = [HasVendorXSfvqmaccdod] in {
- defm VQMACCU_2x8x2 : VPseudoSiFiveVQMACC;
- defm VQMACC_2x8x2 : VPseudoSiFiveVQMACC;
- defm VQMACCUS_2x8x2 : VPseudoSiFiveVQMACC;
- defm VQMACCSU_2x8x2 : VPseudoSiFiveVQMACC;
+ defm VQMACCU_2x8x2 : VPseudoSiFiveVQMACCDOD;
+ defm VQMACC_2x8x2 : VPseudoSiFiveVQMACCDOD;
+ defm VQMACCUS_2x8x2 : VPseudoSiFiveVQMACCDOD;
+ defm VQMACCSU_2x8x2 : VPseudoSiFiveVQMACCDOD;
}
let Predicates = [HasVendorXSfvqmaccqoq] in {
- defm VQMACCU_4x8x4 : VPseudoSiFiveVQMACC;
- defm VQMACC_4x8x4 : VPseudoSiFiveVQMACC;
- defm VQMACCUS_4x8x4 : VPseudoSiFiveVQMACC;
- defm VQMACCSU_4x8x4 : VPseudoSiFiveVQMACC;
+ defm VQMACCU_4x8x4 : VPseudoSiFiveVQMACCQOQ;
+ defm VQMACC_4x8x4 : VPseudoSiFiveVQMACCQOQ;
+ defm VQMACCUS_4x8x4 : VPseudoSiFiveVQMACCQOQ;
+ defm VQMACCSU_4x8x4 : VPseudoSiFiveVQMACCQOQ;
}
let Predicates = [HasVendorXSfvfwmaccqqq] in {
@@ -566,16 +572,25 @@ multiclass VPatVMACC<string intrinsic, string instruction, string kind,
}
}
-defset list<VTypeInfoToWide> VQMACCInfoPairs = {
+defset list<VTypeInfoToWide> VQMACCDODInfoPairs = {
def : VTypeInfoToWide<VI8M1, VI32M1>;
def : VTypeInfoToWide<VI8M2, VI32M2>;
def : VTypeInfoToWide<VI8M4, VI32M4>;
def : VTypeInfoToWide<VI8M8, VI32M8>;
}
-multiclass VPatVQMACC<string intrinsic, string instruction, string kind>
- : VPatVMACC<intrinsic, instruction, kind, VQMACCInfoPairs, vint8m1_t>;
+defset list<VTypeInfoToWide> VQMACCQOQInfoPairs = {
+ def : VTypeInfoToWide<VI8MF2, VI32M1>;
+ def : VTypeInfoToWide<VI8M1, VI32M2>;
+ def : VTypeInfoToWide<VI8M2, VI32M4>;
+ def : VTypeInfoToWide<VI8M4, VI32M8>;
+}
+
+multiclass VPatVQMACCDOD<string intrinsic, string instruction, string kind>
+ : VPatVMACC<intrinsic, instruction, kind, VQMACCDODInfoPairs, vint8m1_t>;
+multiclass VPatVQMACCQOQ<string intrinsic, string instruction, string kind>
+ : VPatVMACC<intrinsic, instruction, kind, VQMACCQOQInfoPairs, vint8m1_t>;
multiclass VPatVFWMACC<string intrinsic, string instruction, string kind>
: VPatVMACC<intrinsic, instruction, kind, AllWidenableBFloatToFloatVectors,
@@ -594,7 +609,7 @@ multiclass VPatVFNRCLIP<string intrinsic, string instruction> {
defvar Vti = pair.Vti;
defvar Wti = pair.Wti;
defm : VPatBinaryRoundingMode<"int_riscv_sf_" # intrinsic,
- "Pseudo" # instruction # "_" # Wti.LMul.MX,
+ "Pseudo" # instruction # "_" # Vti.LMul.MX,
Vti.Vector, Wti.Vector, Wti.Scalar, Vti.Mask,
Vti.Log2SEW, Vti.RegClass,
Wti.RegClass, Wti.ScalarRegClass>;
@@ -637,17 +652,17 @@ let Predicates = [HasVendorXSfvcp] in {
}
let Predicates = [HasVendorXSfvqmaccdod] in {
- defm : VPatVQMACC<"vqmaccu_2x8x2", "VQMACCU", "2x8x2">;
- defm : VPatVQMACC<"vqmacc_2x8x2", "VQMACC", "2x8x2">;
- defm : VPatVQMACC<"vqmaccus_2x8x2", "VQMACCUS", "2x8x2">;
- defm : VPatVQMACC<"vqmaccsu_2x8x2", "VQMACCSU", "2x8x2">;
+ defm : VPatVQMACCDOD<"vqmaccu_2x8x2", "VQMACCU", "2x8x2">;
+ defm : VPatVQMACCDOD<"vqmacc_2x8x2", "VQMACC", "2x8x2">;
+ defm : VPatVQMACCDOD<"vqmaccus_2x8x2", "VQMACCUS", "2x8x2">;
+ defm : VPatVQMACCDOD<"vqmaccsu_2x8x2", "VQMACCSU", "2x8x2">;
}
let Predicates = [HasVendorXSfvqmaccqoq] in {
- defm : VPatVQMACC<"vqmaccu_4x8x4", "VQMACCU", "4x8x4">;
- defm : VPatVQMACC<"vqmacc_4x8x4", "VQMACC", "4x8x4">;
- defm : VPatVQMACC<"vqmaccus_4x8x4", "VQMACCUS", "4x8x4">;
- defm : VPatVQMACC<"vqmaccsu_4x8x4", "VQMACCSU", "4x8x4">;
+ defm : VPatVQMACCQOQ<"vqmaccu_4x8x4", "VQMACCU", "4x8x4">;
+ defm : VPatVQMACCQOQ<"vqmacc_4x8x4", "VQMACC", "4x8x4">;
+ defm : VPatVQMACCQOQ<"vqmaccus_4x8x4", "VQMACCUS", "4x8x4">;
+ defm : VPatVQMACCQOQ<"vqmaccsu_4x8x4", "VQMACCSU", "4x8x4">;
}
let Predicates = [HasVendorXSfvfwmaccqqq] in {
@@ -658,27 +673,3 @@ let Predicates = [HasVendorXSfvfnrclipxfqf] in {
defm : VPatVFNRCLIP<"vfnrclip_xu_f_qf", "VFNRCLIP_XU_F_QF">;
defm : VPatVFNRCLIP<"vfnrclip_x_f_qf", "VFNRCLIP_X_F_QF">;
}
-
-let Predicates = [HasVendorXSfcie] in {
-let hasSideEffects = 1, mayLoad = 0, mayStore = 0, DecoderNamespace = "XSfcie" in {
-def SF_CFLUSH_D_L1 : RVInstI<0b000, OPC_SYSTEM, (outs), (ins GPR:$rs1), "cflush.d.l1","$rs1">,
- Sched<[]> {
- let rd = 0;
- let imm12 = {0b1111,0b1100,0b0000};
-}
-
-def SF_CDISCARD_D_L1 : RVInstI<0b000, OPC_SYSTEM, (outs), (ins GPR:$rs1), "cdiscard.d.l1","$rs1">,
- Sched<[]> {
- let rd = 0;
- let imm12 = {0b1111,0b1100,0b0010};
-}
-
-def SF_CEASE : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "cease","">, Sched<[]> {
- let rs1 = 0;
- let rd = 0;
- let imm12 = {0b0011,0b0000,0b0101};
-}
-}
-def : InstAlias<"cflush.d.l1", (SF_CFLUSH_D_L1 X0)>;
-def : InstAlias<"cdiscard.d.l1", (SF_CDISCARD_D_L1 X0)>;
-} // Predicates = [HasVendorXScie]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index a78f36244468..3506204d6c25 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -56,9 +56,8 @@ def rlist : Operand<OtherVT> {
int64_t Imm;
if (!MCOp.evaluateAsConstantImm(Imm))
return false;
- if (!isUInt<4>(Imm)) return false;
// 0~3 Reserved for EABI
- return (Imm >= 4) && (Imm <= 15);
+ return isUInt<4>(Imm) && Imm >= 4;
}];
}
@@ -70,7 +69,7 @@ def spimm : Operand<OtherVT> {
int64_t Imm;
if (!MCOp.evaluateAsConstantImm(Imm))
return false;
- return isShiftedUInt<5, 4>(Imm);
+ return isShiftedUInt<2, 4>(Imm);
}];
}
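
The two predicate changes above tighten what the Zc operands accept: rlist now takes any 4-bit encoding except the EABI-reserved values 0-3, and spimm drops from isShiftedUInt<5, 4> to isShiftedUInt<2, 4>, i.e. only the four encodable stack adjustments 0, 16, 32 and 48. A self-contained sketch of that behaviour, with local stand-ins for the two MathExtras helpers so it compiles on its own:

#include <cstdint>
#include <cstdio>

// Local stand-ins for llvm::isUInt<N> and llvm::isShiftedUInt<N, S> from
// llvm/Support/MathExtras.h, reproduced here so the sketch is self-contained.
template <unsigned N> constexpr bool isUIntLocal(uint64_t X) {
  return N >= 64 || X < (uint64_t(1) << N);
}
template <unsigned N, unsigned S> constexpr bool isShiftedUIntLocal(uint64_t X) {
  // N contiguous bits left-shifted by S: the value must fit in N+S bits and
  // be a multiple of 2^S.
  return isUIntLocal<N + S>(X) && (X % (uint64_t(1) << S)) == 0;
}

int main() {
  // rlist: any 4-bit encoding except 0-3, which stay reserved for EABI.
  for (uint64_t Imm = 0; Imm <= 16; ++Imm)
    if (isUIntLocal<4>(Imm) && Imm >= 4)
      std::printf("rlist accepts %llu\n", (unsigned long long)Imm);

  // spimm: isShiftedUInt<2, 4> only matches 0, 16, 32 and 48; the old
  // isShiftedUInt<5, 4> also let 64..496 through.
  for (uint64_t Imm = 0; Imm <= 64; Imm += 16)
    std::printf("spimm %3llu -> %s\n", (unsigned long long)Imm,
                isShiftedUIntLocal<2, 4>(Imm) ? "ok" : "rejected");
  return 0;
}
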
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
new file mode 100644
index 000000000000..6fbfde5ef488
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
@@ -0,0 +1,34 @@
+//===-- RISCVInstrInfoZcmop.td -----------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions from the standard Compressed
+// May-Be-Operations Extension (Zcmop).
+// This version is still experimental as the 'Zcmop' extension hasn't been
+// ratified yet. It is based on v0.2 of the specification.
+//
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class CMOPInst<bits<3> imm3, string opcodestr>
+ : RVInst16CI<0b011, 0b01, (outs), (ins), opcodestr, ""> {
+ let Inst{6-2} = 0;
+ let Inst{7} = 1;
+ let Inst{10-8} = imm3;
+ let Inst{12-11} = 0;
+}
+
+// CMOP1 and CMOP5 are used by Zicfiss.
+let Predicates = [HasStdExtZcmop, NoHasStdExtZicfiss] in {
+ def CMOP1 : CMOPInst<0, "cmop.1">, Sched<[]>;
+ def CMOP5 : CMOPInst<2, "cmop.5">, Sched<[]>;
+}
+
+foreach n = [3, 7, 9, 11, 13, 15] in {
+ let Predicates = [HasStdExtZcmop] in
+ def CMOP # n : CMOPInst<!srl(n, 1), "cmop." # n>, Sched<[]>;
+}
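
Every field of CMOPInst except Inst{10-8} is fixed, so each c.mop.n differs only in that 3-bit slot, which carries n >> 1 for odd n (the c.mop.1 and c.mop.5 code points are claimed by Zicfiss; see the cross-check after the next file). A small sketch that assembles the 16-bit word from the field assignments above, purely as an illustration:

#include <cstdint>
#include <cstdio>

// Assemble the 16-bit word described by CMOPInst:
//   [15:13] = 0b011 (funct3), [12:11] = 0, [10:8] = imm3,
//   [7] = 1, [6:2] = 0, [1:0] = 0b01 (compressed quadrant 1).
static uint16_t encodeCMop(unsigned N) { // N is the odd number in "c.mop.N"
  unsigned Imm3 = N >> 1;                // matches !srl(n, 1) in the foreach
  return static_cast<uint16_t>((0b011u << 13) | (Imm3 << 8) | (1u << 7) | 0b01u);
}

int main() {
  for (unsigned N : {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u})
    std::printf("c.mop.%-2u -> 0x%04x\n", N, encodeCMop(N));
  return 0;
}
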
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
new file mode 100644
index 000000000000..49a57f86cccd
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicfiss.td
@@ -0,0 +1,72 @@
+//===------ RISCVInstrInfoZicfiss.td - RISC-V Zicfiss -*- tablegen -*------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction class templates
+//===----------------------------------------------------------------------===//
+
+class RVC_SSInst<bits<5> rs1val, RegisterClass reg_class, string opcodestr> :
+ RVInst16<(outs), (ins reg_class:$rs1), opcodestr, "$rs1", [], InstFormatOther> {
+ let Inst{15-13} = 0b011;
+ let Inst{12} = 0;
+ let Inst{11-7} = rs1val;
+ let Inst{6-2} = 0b00000;
+ let Inst{1-0} = 0b01;
+ let DecoderMethod = "decodeCSSPushPopchk";
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtZicfiss] in {
+let Uses = [SSP], Defs = [SSP], hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
+def SSPOPCHK : RVInstI<0b100, OPC_SYSTEM, (outs), (ins GPRX1X5:$rs1), "sspopchk",
+ "$rs1"> {
+ let rd = 0;
+ let imm12 = 0b110011011100;
+} // Uses = [SSP], Defs = [SSP], hasSideEffects = 0, mayLoad = 1, mayStore = 0
+
+let Uses = [SSP], hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+def SSRDP : RVInstI<0b100, OPC_SYSTEM, (outs GPRNoX0:$rd), (ins), "ssrdp", "$rd"> {
+ let imm12 = 0b110011011100;
+ let rs1 = 0b00000;
+}
+} // Uses = [SSP], hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
+let Uses = [SSP], Defs = [SSP], hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+def SSPUSH : RVInstR<0b1100111, 0b100, OPC_SYSTEM, (outs), (ins GPRX1X5:$rs2),
+ "sspush", "$rs2"> {
+ let rd = 0b00000;
+ let rs1 = 0b00000;
+}
+} // Predicates = [HasStdExtZicfiss]
+
+let Predicates = [HasStdExtZicfiss, HasStdExtZcmop],
+ DecoderNamespace = "Zicfiss" in {
+let Uses = [SSP], Defs = [SSP], hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+def C_SSPUSH : RVC_SSInst<0b00001, GPRX1, "c.sspush">;
+
+let Uses = [SSP], Defs = [SSP], hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
+def C_SSPOPCHK : RVC_SSInst<0b00101, GPRX5, "c.sspopchk">;
+} // Predicates = [HasStdExtZicfiss, HasStdExtZcmop]
+
+let Predicates = [HasStdExtZicfiss] in
+defm SSAMOSWAP_W : AMO_rr_aq_rl<0b01001, 0b010, "ssamoswap.w">;
+
+let Predicates = [HasStdExtZicfiss, IsRV64] in
+defm SSAMOSWAP_D : AMO_rr_aq_rl<0b01001, 0b011, "ssamoswap.d">;
+
+//===----------------------------------------------------------------------===//
+// Compress Instruction tablegen backend.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtZicfiss, HasStdExtZcmop] in {
+def : CompressPat<(SSPUSH X1), (C_SSPUSH X1)>;
+def : CompressPat<(SSPOPCHK X5), (C_SSPOPCHK X5)>;
+} // Predicates = [HasStdExtZicfiss, HasStdExtZcmop]
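
Laying the RVC_SSInst fields next to the CMOPInst fields from the previous file shows why CMOP1 and CMOP5 are predicated on NoHasStdExtZicfiss: with rs1val fixed to x1 and x5, c.sspush and c.sspopchk occupy exactly the c.mop.1 and c.mop.5 encodings. A quick illustrative cross-check (values follow the two classes, not the generated encoder):

#include <cassert>
#include <cstdint>
#include <cstdio>

// CMOPInst (RISCVInstrInfoZcmop.td): [15:13]=0b011, [10:8]=n>>1, [7]=1, [1:0]=0b01.
static uint16_t encodeCMop(unsigned N) {
  return static_cast<uint16_t>((0b011u << 13) | ((N >> 1) << 8) | (1u << 7) | 0b01u);
}

// RVC_SSInst (this file): [15:13]=0b011, [12]=0, [11:7]=rs1val, [6:2]=0, [1:0]=0b01.
static uint16_t encodeSS(unsigned Rs1Val) {
  return static_cast<uint16_t>((0b011u << 13) | (Rs1Val << 7) | 0b01u);
}

int main() {
  assert(encodeSS(/*x1*/ 0b00001) == encodeCMop(1)); // c.sspush x1   == c.mop.1
  assert(encodeSS(/*x5*/ 0b00101) == encodeCMop(5)); // c.sspopchk x5 == c.mop.5
  std::printf("c.sspush x1 / c.mop.1    -> 0x%04x\n", encodeSS(0b00001));
  std::printf("c.sspopchk x5 / c.mop.5  -> 0x%04x\n", encodeSS(0b00101));
  return 0;
}
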
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
new file mode 100644
index 000000000000..1e8c70046c63
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td
@@ -0,0 +1,59 @@
+//===-- RISCVInstrInfoZimop.td -----------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions from the standard
+// May-Be-Operations Extension (Zimop).
+// This version is still experimental as the 'Zimop' extension hasn't been
+// ratified yet. It is based on v0.1 of the specification.
+//
+//===----------------------------------------------------------------------===//
+
+class RVInstIMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, RISCVOpcode opcode,
+ dag outs, dag ins, string opcodestr, string argstr>
+ : RVInstIBase<funct3, opcode, outs, ins, opcodestr, argstr> {
+ let Inst{31} = imm7{6};
+ let Inst{30} = imm5{4};
+ let Inst{29-28} = imm7{5-4};
+ let Inst{27-26} = imm5{3-2};
+ let Inst{25-22} = imm7{3-0};
+ let Inst{21-20} = imm5{1-0};
+}
+
+class RVInstRMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcode,
+ dag outs, dag ins, string opcodestr, string argstr>
+ : RVInstRBase<funct3, opcode, outs, ins, opcodestr, argstr> {
+ let Inst{31} = imm4{3};
+ let Inst{30} = imm3{2};
+ let Inst{29-28} = imm4{2-1};
+ let Inst{27-26} = imm3{1-0};
+ let Inst{25} = imm4{0};
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3,
+ RISCVOpcode opcode, string opcodestr>
+ : RVInstIMopr<imm7, imm5, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1),
+ opcodestr, "$rd, $rs1">;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3,
+ RISCVOpcode opcode, string opcodestr>
+ : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
+ opcodestr, "$rd, $rs1, $rs2">;
+
+foreach i = 0...31 in {
+ let Predicates = [HasStdExtZimop] in
+ def MOPR#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>,
+ Sched<[]>;
+}
+
+foreach i = 0...7 in {
+ let Predicates = [HasStdExtZimop] in
+ def MOPRR#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>,
+ Sched<[]>;
+}
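
RVInstIMopr interleaves the fixed 7-bit id and the per-instruction 5-bit id across Inst[31:20] rather than storing them contiguously. A standalone sketch of just that scatter for mop.r.N (the funct3, opcode and register fields from RVInstIBase are omitted, so only bits 31:20 are shown):

#include <cstdint>
#include <cstdio>

// Scatter the Zimop identifiers the way RVInstIMopr does for mop.r.N:
// imm7 = 0b1000111 (fixed by RVMopr above), imm5 = N, spread over Inst[31:20].
static uint32_t moprIdBits(unsigned N) {
  const uint32_t Imm7 = 0b1000111;
  const uint32_t Imm5 = N & 0x1f;
  uint32_t Inst = 0;
  Inst |= ((Imm7 >> 6) & 0x1) << 31;  // Inst{31}    = imm7{6}
  Inst |= ((Imm5 >> 4) & 0x1) << 30;  // Inst{30}    = imm5{4}
  Inst |= ((Imm7 >> 4) & 0x3) << 28;  // Inst{29-28} = imm7{5-4}
  Inst |= ((Imm5 >> 2) & 0x3) << 26;  // Inst{27-26} = imm5{3-2}
  Inst |= (Imm7 & 0xf) << 22;         // Inst{25-22} = imm7{3-0}
  Inst |= (Imm5 & 0x3) << 20;         // Inst{21-20} = imm5{1-0}
  return Inst;
}

int main() {
  for (unsigned N : {0u, 7u, 15u, 31u})
    std::printf("mop.r.%-2u -> bits[31:20] = 0x%08x\n", N, moprIdBits(N));
  return 0;
}
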
diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
index 02ea5270823d..f948f05b22f7 100644
--- a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp
@@ -58,18 +58,66 @@ static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) {
return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
}
-// Fuse these patterns:
-//
-// slli rd, rs1, 32
-// srli rd, rd, x
-// where 0 <= x <= 32
-//
-// and
-//
+// Fuse zero extension of halfword:
// slli rd, rs1, 48
+// srli rd, rd, 48
+static bool isZExtH(const MachineInstr *FirstMI, const MachineInstr &SecondMI) {
+ if (SecondMI.getOpcode() != RISCV::SRLI)
+ return false;
+
+ if (!SecondMI.getOperand(2).isImm())
+ return false;
+
+ if (SecondMI.getOperand(2).getImm() != 48)
+ return false;
+
+ // Given SecondMI, when FirstMI is unspecified, we must return
+ // if SecondMI may be part of a fused pair at all.
+ if (!FirstMI)
+ return true;
+
+ if (FirstMI->getOpcode() != RISCV::SLLI)
+ return false;
+
+ if (FirstMI->getOperand(2).getImm() != 48)
+ return false;
+
+ return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
+}
+
+// Fuse zero extension of word:
+// slli rd, rs1, 32
+// srli rd, rd, 32
+static bool isZExtW(const MachineInstr *FirstMI, const MachineInstr &SecondMI) {
+ if (SecondMI.getOpcode() != RISCV::SRLI)
+ return false;
+
+ if (!SecondMI.getOperand(2).isImm())
+ return false;
+
+ if (SecondMI.getOperand(2).getImm() != 32)
+ return false;
+
+ // Given SecondMI, when FirstMI is unspecified, we must return
+ // if SecondMI may be part of a fused pair at all.
+ if (!FirstMI)
+ return true;
+
+ if (FirstMI->getOpcode() != RISCV::SLLI)
+ return false;
+
+ if (FirstMI->getOperand(2).getImm() != 32)
+ return false;
+
+ return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
+}
+
+// Fuse shifted zero extension of word:
+// slli rd, rs1, 32
// srli rd, rd, x
-static bool isShiftedZExt(const MachineInstr *FirstMI,
- const MachineInstr &SecondMI) {
+// where 0 <= x < 32
+static bool isShiftedZExtW(const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
if (SecondMI.getOpcode() != RISCV::SRLI)
return false;
@@ -77,8 +125,7 @@ static bool isShiftedZExt(const MachineInstr *FirstMI,
return false;
unsigned SRLIImm = SecondMI.getOperand(2).getImm();
- bool IsShiftBy48 = SRLIImm == 48;
- if (SRLIImm > 32 && !IsShiftBy48)
+ if (SRLIImm >= 32)
return false;
// Given SecondMI, when FirstMI is unspecified, we must return
@@ -89,8 +136,7 @@ static bool isShiftedZExt(const MachineInstr *FirstMI,
if (FirstMI->getOpcode() != RISCV::SLLI)
return false;
- unsigned SLLIImm = FirstMI->getOperand(2).getImm();
- if (IsShiftBy48 ? (SLLIImm != 48) : (SLLIImm != 32))
+ if (FirstMI->getOperand(2).getImm() != 32)
return false;
return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI);
@@ -144,7 +190,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
if (ST.hasAUIPCADDIFusion() && isAUIPCADDI(FirstMI, SecondMI))
return true;
- if (ST.hasShiftedZExtFusion() && isShiftedZExt(FirstMI, SecondMI))
+ if (ST.hasZExtHFusion() && isZExtH(FirstMI, SecondMI))
+ return true;
+
+ if (ST.hasZExtWFusion() && isZExtW(FirstMI, SecondMI))
+ return true;
+
+ if (ST.hasShiftedZExtWFusion() && isShiftedZExtW(FirstMI, SecondMI))
return true;
if (ST.hasLDADDFusion() && isLDADD(FirstMI, SecondMI))
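
The old isShiftedZExt predicate is split into three separately tunable checks. On RV64 the recognised slli/srli pairs are the usual shift idioms for zero extension, which a plain-integer sketch makes easy to see (illustrative only, no MachineInstr involved):

#include <cstdint>
#include <cstdio>

// RV64 value semantics of the fused slli/srli pairs, on plain 64-bit integers.
static uint64_t slliSrli(uint64_t Rs1, unsigned Left, unsigned Right) {
  return (Rs1 << Left) >> Right;
}

int main() {
  uint64_t X = 0xfedcba9876543210ULL;
  // slli 48 ; srli 48  == zero-extend halfword (zext.h).
  std::printf("zext.h   -> 0x%016llx\n", (unsigned long long)slliSrli(X, 48, 48));
  // slli 32 ; srli 32  == zero-extend word (zext.w).
  std::printf("zext.w   -> 0x%016llx\n", (unsigned long long)slliSrli(X, 32, 32));
  // slli 32 ; srli x with 0 <= x < 32  == zext.w shifted left by (32 - x).
  for (unsigned Right : {0u, 8u, 31u})
    std::printf("srli %-2u  -> 0x%016llx\n", Right,
                (unsigned long long)slliSrli(X, 32, Right));
  return 0;
}
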
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 16c79519fcac..ba8996e710ed 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -174,8 +174,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC,
- FeatureStdExtZihintpause,
- FeatureVendorXSfcie],
+ FeatureStdExtZihintpause],
[TuneSiFive7]>;
def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
@@ -273,7 +272,13 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
FeatureStdExtZicbop,
FeatureStdExtZicboz,
FeatureVendorXVentanaCondOps],
- [TuneVeyronFusions]>;
+ [TuneVentanaVeyron,
+ TuneLUIADDIFusion,
+ TuneAUIPCADDIFusion,
+ TuneZExtHFusion,
+ TuneZExtWFusion,
+ TuneShiftedZExtWFusion,
+ TuneLDADDFusion]>;
def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
NoSchedModel,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index a3c19115bd31..24f8d600f1ea 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -127,6 +127,9 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, RISCV::X27);
}
+ // Shadow stack pointer.
+ markSuperRegs(Reserved, RISCV::SSP);
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index c59c9b294d79..840fd149d681 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -137,6 +137,8 @@ def GPR : GPRRegisterClass<(add (sequence "X%u", 10, 17),
(sequence "X%u", 0, 4))>;
def GPRX0 : GPRRegisterClass<(add X0)>;
+def GPRX1 : GPRRegisterClass<(add X1)>;
+def GPRX5 : GPRRegisterClass<(add X5)>;
def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)>;
@@ -165,6 +167,8 @@ def SP : GPRRegisterClass<(add X2)>;
def SR07 : GPRRegisterClass<(add (sequence "X%u", 8, 9),
(sequence "X%u", 18, 23))>;
+def GPRX1X5 : GPRRegisterClass<(add X1, X5)>;
+
// Floating point registers
let RegAltNameIndices = [ABIRegAltName] in {
def F0_H : RISCVReg16<0, "f0", ["ft0"]>, DwarfRegNum<[32]>;
@@ -591,3 +595,6 @@ foreach m = LMULList in {
// Special registers
def FFLAGS : RISCVReg<0, "fflags">;
def FRM : RISCVReg<0, "frm">;
+
+// Shadow Stack register
+def SSP : RISCVReg<0, "ssp">;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 7540218633bf..26320b05d9be 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -190,8 +190,8 @@ public:
}
bool hasMacroFusion() const {
- return hasLUIADDIFusion() || hasAUIPCADDIFusion() ||
- hasShiftedZExtFusion() || hasLDADDFusion();
+ return hasLUIADDIFusion() || hasAUIPCADDIFusion() || hasZExtHFusion() ||
+ hasZExtWFusion() || hasShiftedZExtWFusion() || hasLDADDFusion();
}
// Vector codegen related methods.
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 953df7b15e2f..43475e825b46 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -19,9 +19,11 @@ include "llvm/TableGen/SearchableTable.td"
class SysReg<string name, bits<12> op> {
string Name = name;
- // A maximum of one deprecated name is supported right now. It generates a
- // diagnostic when the name is used to encourage software to migrate away from
- // the name.
+ // A maximum of one alias is supported right now.
+ string AltName = name;
+ // A maximum of one deprecated name is supported right now. Unlike the
+ // `AltName` alias, a `DeprecatedName` generates a diagnostic when the name is
+ // used to encourage software to migrate away from the name.
string DeprecatedName = "";
bits<12> Encoding = op;
// FIXME: add these additional fields when needed.
@@ -41,7 +43,7 @@ def SysRegsList : GenericTable {
let FilterClass = "SysReg";
// FIXME: add "ReadWrite", "Mode", "Extra", "Number" fields when needed.
let Fields = [
- "Name", "DeprecatedName", "Encoding", "FeaturesRequired",
+ "Name", "AltName", "DeprecatedName", "Encoding", "FeaturesRequired",
"isRV32Only",
];
@@ -54,32 +56,13 @@ def lookupSysRegByName : SearchIndex {
let Key = [ "Name" ];
}
-def lookupSysRegByDeprecatedName : SearchIndex {
+def lookupSysRegByAltName : SearchIndex {
let Table = SysRegsList;
- let Key = [ "DeprecatedName" ];
-}
-
-class SiFiveReg<string name, bits<12> op> : SysReg<name, op>;
-
-def SiFiveRegsList : GenericTable {
- let FilterClass = "SiFiveReg";
- // FIXME: add "ReadWrite", "Mode", "Extra", "Number" fields when needed.
- let Fields = [
- "Name", "DeprecatedName", "Encoding", "FeaturesRequired",
- "isRV32Only",
- ];
-
- let PrimaryKey = [ "Encoding" ];
- let PrimaryKeyName = "lookupSiFiveRegByEncoding";
+ let Key = [ "AltName" ];
}
-def lookupSiFiveRegByName : SearchIndex {
- let Table = SiFiveRegsList;
- let Key = [ "Name" ];
-}
-
-def lookupSiFiveRegByDeprecatedName : SearchIndex {
- let Table = SiFiveRegsList;
+def lookupSysRegByDeprecatedName : SearchIndex {
+ let Table = SysRegsList;
let Key = [ "DeprecatedName" ];
}
@@ -309,7 +292,7 @@ foreach i = 3...31 in
//===----------------------------------------------------------------------===//
// Machine Counter Setup
//===----------------------------------------------------------------------===//
-let DeprecatedName = "mucounteren" in // Privileged spec v1.9.1 Name
+let AltName = "mucounteren" in // Privileged spec v1.9.1 Name
def : SysReg<"mcountinhibit", 0x320>;
// mhpmevent3-mhpmevent31 at 0x323-0x33F.
@@ -323,20 +306,6 @@ foreach i = 3...31 in {
}
//===----------------------------------------------------------------------===//
-// SiFive Custom Machine Mode Registers
-//===----------------------------------------------------------------------===//
-
-let FeaturesRequired = [{ {RISCV::FeatureVendorXSfcie} }] in {
-def : SiFiveReg<"mnscratch", 0x350>;
-def : SiFiveReg<"mnepc", 0x351>;
-def : SiFiveReg<"mncause", 0x352>;
-def : SiFiveReg<"mnstatus", 0x353>;
-def : SiFiveReg<"mbpm", 0x7C0>;
-def : SiFiveReg<"mfd", 0x7C1>;
-def : SiFiveReg<"mpd", 0x7C8>;
-}
-
-//===----------------------------------------------------------------------===//
// Debug/ Trace Registers (shared with Debug Mode)
//===----------------------------------------------------------------------===//
def : SysReg<"tselect", 0x7A0>;
@@ -353,7 +322,7 @@ def : SysReg<"dpc", 0x7B1>;
// "dscratch" is an alternative name for "dscratch0" which appeared in earlier
// drafts of the RISC-V debug spec
-let DeprecatedName = "dscratch" in
+let AltName = "dscratch" in
def : SysReg<"dscratch0", 0x7B2>;
def : SysReg<"dscratch1", 0x7B3>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 96ecc771863e..4c955744b37d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -359,7 +359,8 @@ public:
const TargetTransformInfo::LSRCost &C2);
bool shouldFoldTerminatingConditionAfterLSR() const {
- return true;
+ // FIXME: Enabling this causes miscompiles.
+ return false;
}
};
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 5ac45079bd00..c85bd27d256b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1617,7 +1617,7 @@ static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call,
SPIRVGlobalRegistry *GR) {
MachineRegisterInfo *MRI = MIRBuilder.getMRI();
const DataLayout &DL = MIRBuilder.getDataLayout();
- bool HasEvents = Call->Builtin->Name.find("events") != StringRef::npos;
+ bool HasEvents = Call->Builtin->Name.contains("events");
const SPIRVType *Int32Ty = GR->getOrCreateSPIRVIntegerType(32, MIRBuilder);
// Make vararg instructions before OpEnqueueKernel.
@@ -2098,7 +2098,7 @@ parseBuiltinTypeNameToTargetExtType(std::string TypeName,
// Parameterized SPIR-V builtins names follow this format:
// e.g. %spirv.Image._void_1_0_0_0_0_0_0, %spirv.Pipe._0
- if (NameWithParameters.find('_') == std::string::npos)
+ if (!NameWithParameters.contains('_'))
return TargetExtType::get(MIRBuilder.getContext(), NameWithParameters);
SmallVector<StringRef> Parameters;
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index b8a6784ff3c6..6c009b9e8dde 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -645,7 +645,7 @@ SPIRVType *SPIRVGlobalRegistry::findSPIRVType(
Register Reg = DT.find(Ty, &MIRBuilder.getMF());
if (Reg.isValid())
return getSPIRVTypeForVReg(Reg);
- if (ForwardPointerTypes.find(Ty) != ForwardPointerTypes.end())
+ if (ForwardPointerTypes.contains(Ty))
return ForwardPointerTypes[Ty];
return restOfCreateSPIRVType(Ty, MIRBuilder, AccQual, EmitIR);
}
@@ -712,14 +712,14 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
// Null pointer means we have a loop in type definitions, make and
// return corresponding OpTypeForwardPointer.
if (SpvElementType == nullptr) {
- if (ForwardPointerTypes.find(Ty) == ForwardPointerTypes.end())
+ if (!ForwardPointerTypes.contains(Ty))
ForwardPointerTypes[PType] = getOpTypeForwardPointer(SC, MIRBuilder);
return ForwardPointerTypes[PType];
}
Register Reg(0);
// If we have forward pointer associated with this type, use its register
// operand to create OpTypePointer.
- if (ForwardPointerTypes.find(PType) != ForwardPointerTypes.end())
+ if (ForwardPointerTypes.contains(PType))
Reg = getSPIRVTypeID(ForwardPointerTypes[PType]);
return getOpTypePointer(SC, SpvElementType, MIRBuilder, Reg);
@@ -959,8 +959,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
// N is the number of elements of the vector.
Type *Ty;
- if (TypeStr.starts_with("atomic_"))
- TypeStr = TypeStr.substr(strlen("atomic_"));
+ TypeStr.consume_front("atomic_");
if (TypeStr.starts_with("void")) {
Ty = Type::getVoidTy(Ctx);
@@ -1007,8 +1006,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName(
// Handle "typeN*" or "type vector[N]*".
bool IsPtrToVec = TypeStr.consume_back("*");
- if (TypeStr.starts_with(" vector[")) {
- TypeStr = TypeStr.substr(strlen(" vector["));
+ if (TypeStr.consume_front(" vector[")) {
TypeStr = TypeStr.substr(0, TypeStr.find(']'));
}
TypeStr.getAsInteger(10, VecElts);
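
The SPIR-V hunks above are a mechanical cleanup: find() != npos becomes contains(), and the starts_with + substr pairs become consume_front, which tests and strips in one call. A small sketch of the consume_front rewrite using std::string_view as a stand-in, so the snippet has no LLVM dependency (llvm::StringRef behaves the same way for this purpose):

#include <cstdio>
#include <string>
#include <string_view>

// std::string_view stand-in for llvm::StringRef::consume_front(): strip the
// prefix when present and report whether anything was stripped.
static bool consumeFront(std::string_view &S, std::string_view Prefix) {
  if (S.substr(0, Prefix.size()) != Prefix)
    return false;
  S.remove_prefix(Prefix.size());
  return true;
}

int main() {
  std::string_view TypeStr = "atomic_int vector[4]*";

  // Old shape: if (starts_with("atomic_")) TypeStr = TypeStr.substr(strlen("atomic_"));
  // New shape: one call that both tests and strips.
  consumeFront(TypeStr, "atomic_");
  std::printf("after consume_front: '%s'\n", std::string(TypeStr).c_str());

  // The same idea applies to the " vector[" prefix before reading the count.
  std::string_view Rest = TypeStr.substr(TypeStr.find(' '));
  if (consumeFront(Rest, " vector["))
    Rest = Rest.substr(0, Rest.find(']'));
  std::printf("element count text:  '%s'\n", std::string(Rest).c_str());
  return 0;
}
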
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 779036016560..2a830535a2aa 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -391,7 +391,7 @@ void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
if (MI.getOpcode() != SPIRV::OpExtInst)
continue;
auto Set = MI.getOperand(2).getImm();
- if (MAI.ExtInstSetMap.find(Set) == MAI.ExtInstSetMap.end())
+ if (!MAI.ExtInstSetMap.contains(Set))
MAI.ExtInstSetMap[Set] = Register::index2VirtReg(MAI.getNextID());
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 5124181b49e2..d0b8027edd42 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -189,7 +189,7 @@ struct ModuleAnalysisInfo {
}
unsigned getNextID() { return MaxID++; }
bool hasMBBRegister(const MachineBasicBlock &MBB) {
- return BBNumToRegMap.find(MBB.getNumber()) != BBNumToRegMap.end();
+ return BBNumToRegMap.contains(MBB.getNumber());
}
// Convert MBB's number to corresponding ID register.
Register getOrCreateMBBRegister(const MachineBasicBlock &MBB) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index f4076be2a7b7..1bfce70fedc0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -83,7 +83,7 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
}
for (MachineInstr *MI : ToErase) {
Register Reg = MI->getOperand(2).getReg();
- if (RegsAlreadyAddedToDT.find(MI) != RegsAlreadyAddedToDT.end())
+ if (RegsAlreadyAddedToDT.contains(MI))
Reg = RegsAlreadyAddedToDT[MI];
auto *RC = MRI.getRegClassOrNull(MI->getOperand(0).getReg());
if (!MRI.getRegClassOrNull(Reg) && RC)
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 4f0801479211..78bdf3ae9a84 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -2050,7 +2050,7 @@ static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
LHS.getOperand(3).getOpcode() == SPISD::CMPFCC_V9))) &&
isOneConstant(LHS.getOperand(0)) && isNullConstant(LHS.getOperand(1))) {
SDValue CMPCC = LHS.getOperand(3);
- SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+ SPCC = LHS.getConstantOperandVal(2);
LHS = CMPCC.getOperand(0);
RHS = CMPCC.getOperand(1);
}
@@ -3186,7 +3186,7 @@ static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
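
This hunk and the SystemZ, VE and X86 hunks that follow all make the same substitution: cast<ConstantSDNode>(X.getOperand(i))->getZExtValue() becomes the existing getConstantOperandVal(i) helper on SDValue/SDNode. A toy illustration of why the two spellings are interchangeable (mock types only, not the real SelectionDAG classes):

#include <cassert>
#include <cstdint>
#include <vector>

// Toy types, only to show that getConstantOperandVal(i) is shorthand for
// cast<ConstantSDNode>(getOperand(i))->getZExtValue().
struct ToyConstantSDNode {
  uint64_t Value;
  uint64_t getZExtValue() const { return Value; }
};

struct ToySDNode {
  std::vector<ToyConstantSDNode> Ops;
  const ToyConstantSDNode &getOperand(unsigned I) const { return Ops[I]; }
  uint64_t getConstantOperandVal(unsigned I) const {
    // The LLVM helper performs the cast-and-getZExtValue step internally, so
    // call sites no longer spell it out.
    return getOperand(I).getZExtValue();
  }
};

int main() {
  ToySDNode Op{{{42}, {7}}};
  assert(Op.getOperand(0).getZExtValue() == Op.getConstantOperandVal(0));
  assert(Op.getConstantOperandVal(1) == 7);
  return 0;
}
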
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 559f2ca476d7..045c4c0aac07 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -2186,7 +2186,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// the mask of valid CC values if so.
static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
unsigned &CCValid) {
- unsigned Id = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned Id = Op.getConstantOperandVal(1);
switch (Id) {
case Intrinsic::s390_tbegin:
Opcode = SystemZISD::TBEGIN;
@@ -2212,7 +2212,7 @@ static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
// CC value as its final argument. Provide the associated SystemZISD
// opcode and the mask of valid CC values if so.
static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
- unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Id = Op.getConstantOperandVal(0);
switch (Id) {
case Intrinsic::s390_vpkshs:
case Intrinsic::s390_vpksfs:
@@ -2600,10 +2600,9 @@ static bool shouldSwapCmpOperands(const Comparison &C) {
return true;
if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
return true;
- if (C.ICmpType != SystemZICMP::SignedOnly &&
- Opcode0 == ISD::AND &&
+ if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::AND &&
C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff)
+ C.Op0.getConstantOperandVal(1) == 0xffffffff)
return true;
return false;
@@ -3429,11 +3428,9 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
return (Neg.getOpcode() == ISD::SUB &&
Neg.getOperand(0).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() == 0 &&
- Neg.getOperand(1) == Pos &&
- (Pos == CmpOp ||
- (Pos.getOpcode() == ISD::SIGN_EXTEND &&
- Pos.getOperand(0) == CmpOp)));
+ Neg.getConstantOperandVal(0) == 0 && Neg.getOperand(1) == Pos &&
+ (Pos == CmpOp || (Pos.getOpcode() == ISD::SIGN_EXTEND &&
+ Pos.getOperand(0) == CmpOp)));
}
// Return the absolute or negative absolute of Op; IsNegative decides which.
@@ -3740,7 +3737,7 @@ SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
MFI.setFrameAddressIsTaken(true);
SDLoc DL(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// By definition, the frame address is the address of the back chain. (In
@@ -3776,7 +3773,7 @@ SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
return SDValue();
SDLoc DL(Op);
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
@@ -4226,7 +4223,7 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
if (HighOp.getOpcode() == ISD::AND &&
HighOp.getOperand(1).getOpcode() == ISD::Constant) {
SDValue HighOp0 = HighOp.getOperand(0);
- uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
+ uint64_t Mask = HighOp.getConstantOperandVal(1);
if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
HighOp = HighOp0;
}
@@ -4485,10 +4482,10 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
- AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
- cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
- cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+ AtomicOrdering FenceOrdering =
+ static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
+ SyncScope::ID FenceSSID =
+ static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
@@ -4773,13 +4770,13 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
SelectionDAG &DAG) const {
- bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ bool IsData = Op.getConstantOperandVal(4);
if (!IsData)
// Just preserve the chain.
return Op.getOperand(0);
SDLoc DL(Op);
- bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ bool IsWrite = Op.getConstantOperandVal(2);
unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
SDValue Ops[] = {Op.getOperand(0), DAG.getTargetConstant(Code, DL, MVT::i32),
@@ -4825,7 +4822,7 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue(Node, 0), getCCResult(DAG, SDValue(Node, 1)));
}
- unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Id = Op.getConstantOperandVal(0);
switch (Id) {
case Intrinsic::thread_pointer:
return lowerThreadPointer(SDLoc(Op), DAG);
@@ -5628,7 +5625,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
Op = Op.getOperand(0);
if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op.getOperand(1).getOpcode() == ISD::Constant) {
- unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ unsigned Elem = Op.getConstantOperandVal(1);
if (!GS.add(Op.getOperand(0), Elem))
return SDValue();
FoundOne = true;
@@ -6727,8 +6724,7 @@ SDValue SystemZTargetLowering::combineLOAD(
int Index = 1;
if (User->getOpcode() == ISD::SRL &&
User->getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(User->getOperand(1))->getZExtValue() == 64 &&
- User->hasOneUse()) {
+ User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
User = *User->use_begin();
Index = 0;
}
@@ -6857,7 +6853,7 @@ static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
std::swap(Op0, Op1);
if (Op1.getOpcode() != ISD::SHL || !Op1.getNode()->hasOneUse() ||
Op1.getOperand(1).getOpcode() != ISD::Constant ||
- cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue() != 64)
+ Op1.getConstantOperandVal(1) != 64)
return false;
Op1 = Op1.getOperand(0);
@@ -7149,20 +7145,18 @@ SDValue SystemZTargetLowering::combineFP_ROUND(
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
SelectionDAG &DAG = DCI.DAG;
SDValue Op0 = N->getOperand(OpNo);
- if (N->getValueType(0) == MVT::f32 &&
- Op0.hasOneUse() &&
+ if (N->getValueType(0) == MVT::f32 && Op0.hasOneUse() &&
Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0).getValueType() == MVT::v2f64 &&
Op0.getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+ Op0.getConstantOperandVal(1) == 0) {
SDValue Vec = Op0.getOperand(0);
for (auto *U : Vec->uses()) {
- if (U != Op0.getNode() &&
- U->hasOneUse() &&
+ if (U != Op0.getNode() && U->hasOneUse() &&
U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
U->getOperand(0) == Vec &&
U->getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+ U->getConstantOperandVal(1) == 1) {
SDValue OtherRound = SDValue(*U->use_begin(), 0);
if (OtherRound.getOpcode() == N->getOpcode() &&
OtherRound.getOperand(OpNo) == SDValue(U, 0) &&
@@ -7215,20 +7209,18 @@ SDValue SystemZTargetLowering::combineFP_EXTEND(
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
SelectionDAG &DAG = DCI.DAG;
SDValue Op0 = N->getOperand(OpNo);
- if (N->getValueType(0) == MVT::f64 &&
- Op0.hasOneUse() &&
+ if (N->getValueType(0) == MVT::f64 && Op0.hasOneUse() &&
Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0).getValueType() == MVT::v4f32 &&
Op0.getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+ Op0.getConstantOperandVal(1) == 0) {
SDValue Vec = Op0.getOperand(0);
for (auto *U : Vec->uses()) {
- if (U != Op0.getNode() &&
- U->hasOneUse() &&
+ if (U != Op0.getNode() && U->hasOneUse() &&
U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
U->getOperand(0) == Vec &&
U->getOperand(1).getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 2) {
+ U->getConstantOperandVal(1) == 2) {
SDValue OtherExtend = SDValue(*U->use_begin(), 0);
if (OtherExtend.getOpcode() == N->getOpcode() &&
OtherExtend.getOperand(OpNo) == SDValue(U, 0) &&
@@ -7605,7 +7597,7 @@ SDValue SystemZTargetLowering::combineINTRINSIC(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- unsigned Id = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned Id = N->getConstantOperandVal(1);
switch (Id) {
// VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15
// or larger is simply a vector load.
@@ -7679,7 +7671,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
APInt SrcDemE;
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
- unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Id = Op.getConstantOperandVal(0);
switch (Id) {
case Intrinsic::s390_vpksh: // PACKS
case Intrinsic::s390_vpksf:
@@ -7723,7 +7715,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
SrcDemE = APInt(NumElts, 0);
if (!DemandedElts[OpNo - 1])
break;
- unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned Mask = Op.getConstantOperandVal(3);
unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
// Demand input element 0 or 1, given by the mask bit value.
SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
@@ -7732,7 +7724,7 @@ static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
case Intrinsic::s390_vsldb: {
// VECTOR SHIFT LEFT DOUBLE BY BYTE
assert(VT == MVT::v16i8 && "Unexpected type.");
- unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned FirstIdx = Op.getConstantOperandVal(3);
assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
unsigned NumSrc0Els = 16 - FirstIdx;
SrcDemE = APInt(NumElts, 0);
@@ -7808,7 +7800,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
bool IsLogical = false;
- unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Id = Op.getConstantOperandVal(0);
switch (Id) {
case Intrinsic::s390_vpksh: // PACKS
case Intrinsic::s390_vpksf:
@@ -7908,7 +7900,7 @@ SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
- unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Id = Op.getConstantOperandVal(0);
switch (Id) {
case Intrinsic::s390_vpksh: // PACKS
case Intrinsic::s390_vpksf:
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index af6cf340f8a3..d98bb886c185 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -507,11 +507,11 @@ def z_subcarry : PatFrag<(ops node:$lhs, node:$rhs),
// Signed and unsigned comparisons.
def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{
- unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Type = N->getConstantOperandVal(2);
return Type != SystemZICMP::UnsignedOnly;
}]>;
def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, timm), [{
- unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Type = N->getConstantOperandVal(2);
return Type != SystemZICMP::SignedOnly;
}]>;
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 3f96bd37755e..2a4383314e46 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -61,8 +61,6 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const {
// We should properly mark well-known section name prefixes as small/large,
// because otherwise the output section may have the wrong section flags and
// the linker will lay it out in an unexpected way.
- // TODO: bring back lbss/ldata/lrodata checks after fixing accesses to large
- // globals in the small code model.
StringRef Name = GV->getSection();
if (!Name.empty()) {
auto IsPrefix = [&](StringRef Prefix) {
@@ -71,6 +69,8 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const {
};
if (IsPrefix(".bss") || IsPrefix(".data") || IsPrefix(".rodata"))
return false;
+ if (IsPrefix(".lbss") || IsPrefix(".ldata") || IsPrefix(".lrodata"))
+ return true;
}
// For x86-64, we treat an explicit GlobalVariable small code model to mean
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 0267aefd1e91..0e41a2d7aa03 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -1101,10 +1101,10 @@ Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder,
SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
- AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
- cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
- cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+ AtomicOrdering FenceOrdering =
+ static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
+ SyncScope::ID FenceSSID =
+ static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
// VE uses Release consistency, so need a fence instruction if it is a
// cross-thread fence.
@@ -1766,7 +1766,7 @@ static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
switch (IntNo) {
default: // Don't custom lower most intrinsics.
return SDValue();
@@ -2937,8 +2937,8 @@ static bool isI32Insn(const SDNode *User, const SDNode *N) {
if (User->getOperand(1).getNode() != N &&
User->getOperand(2).getNode() != N &&
isa<ConstantSDNode>(User->getOperand(3))) {
- VECC::CondCode VECCVal = static_cast<VECC::CondCode>(
- cast<ConstantSDNode>(User->getOperand(3))->getZExtValue());
+ VECC::CondCode VECCVal =
+ static_cast<VECC::CondCode>(User->getConstantOperandVal(3));
return isIntVECondCode(VECCVal);
}
[[fallthrough]];
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 1d40ce35c1b4..051f6caa8c04 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -108,6 +108,8 @@ class X86AsmParser : public MCTargetAsmParser {
// Does this instruction use apx extended register?
bool UseApxExtendedReg = false;
+ // Is this instruction explicitly required not to update flags?
+ bool ForcedNoFlag = false;
private:
SMLoc consumeToken() {
@@ -263,8 +265,7 @@ private:
return 0;
SmallVector<ICToken, 16> OperandStack;
- for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
- ICToken Op = PostfixStack[i];
+ for (const ICToken &Op : PostfixStack) {
if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
OperandStack.push_back(Op);
} else if (isUnaryOperator(Op.first)) {
@@ -1731,8 +1732,8 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
OrigOperands.pop_back();
}
// OrigOperands.append(FinalOperands.begin(), FinalOperands.end());
- for (unsigned int i = 0; i < FinalOperands.size(); ++i)
- OrigOperands.push_back(std::move(FinalOperands[i]));
+ for (auto &Op : FinalOperands)
+ OrigOperands.push_back(std::move(Op));
return false;
}
@@ -2313,8 +2314,7 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM,
// Drop the optional '.'.
StringRef DotDispStr = Tok.getString();
- if (DotDispStr.starts_with("."))
- DotDispStr = DotDispStr.drop_front(1);
+ DotDispStr.consume_front(".");
StringRef TrailingDot;
// .Imm gets lexed as a real.
@@ -3127,6 +3127,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
ForcedVEXEncoding = VEXEncoding_Default;
ForcedDispEncoding = DispEncoding_Default;
UseApxExtendedReg = false;
+ ForcedNoFlag = false;
// Parse pseudo prefixes.
while (true) {
@@ -3151,6 +3152,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
ForcedDispEncoding = DispEncoding_Disp8;
else if (Prefix == "disp32")
ForcedDispEncoding = DispEncoding_Disp32;
+ else if (Prefix == "nf")
+ ForcedNoFlag = true;
else
return Error(NameLoc, "unknown prefix");
@@ -3998,6 +4001,8 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
if (UseApxExtendedReg && !X86II::canUseApxExtendedReg(MCID))
return Match_Unsupported;
+ if (ForcedNoFlag != !!(MCID.TSFlags & X86II::EVEX_NF))
+ return Match_Unsupported;
if (ForcedVEXEncoding == VEXEncoding_EVEX &&
(MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX)
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 59e2008f5632..347dc0d4ed43 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -1169,7 +1169,11 @@ static int getInstructionID(struct InternalInstruction *insn,
attrMask |= ATTR_EVEXKZ;
if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
attrMask |= ATTR_EVEXB;
- if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ // nf bit is the MSB of aaa
+ if (nfFromEVEX4of4(insn->vectorExtensionPrefix[3]) &&
+ insn->opcodeType == MAP4)
+ attrMask |= ATTR_EVEXNF;
+ else if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
attrMask |= ATTR_EVEXK;
if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
attrMask |= ATTR_VEXL;
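
In byte 4 of the EVEX payload the new NF bit overlaps the aaa opmask field: aaa occupies bits [2:0] and NF reuses bit 2, its MSB, which is why the decoder only interprets it as NF for MAP4 opcodes and why the encoder's setNF further down ORs V << 2 into EVEX_aaa. A minimal sketch of the two extractions, assuming the offset helper macros expand to the obvious shifts (their definitions sit outside this hunk):

#include <cstdint>
#include <cstdio>

// Assumed expansions of the byte-4 helpers used above (aaaFromEVEX4of4 and
// nfFromEVEX4of4): aaa is bits [2:0] of the fourth EVEX payload byte, and
// nf is bit 2, i.e. the MSB of aaa.
static unsigned aaaOfByte4(uint8_t B) { return B & 0x7; }
static unsigned nfOfByte4(uint8_t B) { return (B >> 2) & 0x1; }

int main() {
  for (unsigned Byte4 : {0x01u, 0x04u, 0x07u}) {
    // For MAP4 opcodes a set nf bit selects ATTR_EVEXNF; otherwise a non-zero
    // aaa selects an opmask register via ATTR_EVEXK.
    std::printf("byte4=0x%02x  aaa=%u  nf=%u\n", Byte4,
                aaaOfByte4(static_cast<uint8_t>(Byte4)),
                nfOfByte4(static_cast<uint8_t>(Byte4)));
  }
  return 0;
}
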
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index decc45091941..4c7b1c094522 100644
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -103,6 +103,7 @@ namespace X86Disassembler {
#define bFromEVEX4of4(evex) bitFromOffset4(evex)
#define v2FromEVEX4of4(evex) invertedBitFromOffset3(evex)
#define aaaFromEVEX4of4(evex) threeBitsFromOffset0(evex)
+#define nfFromEVEX4of4(evex) bitFromOffset2(evex)
// These enums represent Intel registers for use by the decoder.
#define REGS_8BIT \
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index b0fcaef5f4b0..e006dd877360 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -870,7 +870,10 @@ enum : uint64_t {
ExplicitVEXPrefix = 2ULL << ExplicitOpPrefixShift,
/// For instructions that are promoted to EVEX space for EGPR.
ExplicitEVEXPrefix = 3ULL << ExplicitOpPrefixShift,
- ExplicitOpPrefixMask = 3ULL << ExplicitOpPrefixShift
+ ExplicitOpPrefixMask = 3ULL << ExplicitOpPrefixShift,
+ /// EVEX_NF - Set if this instruction has EVEX.NF field set.
+ EVEX_NFShift = ExplicitOpPrefixShift + 2,
+ EVEX_NF = 1ULL << EVEX_NFShift
};
/// \returns true if the instruction with given opcode is a prefix.
@@ -992,6 +995,12 @@ inline unsigned getOperandBias(const MCInstrDesc &Desc) {
}
}
+/// \returns true if the instruction has an NDD (new data destination).
+inline bool hasNewDataDest(uint64_t TSFlags) {
+ return (TSFlags & X86II::OpMapMask) == X86II::T_MAP4 &&
+ (TSFlags & X86II::EVEX_B) && (TSFlags & X86II::VEX_4V);
+}
+
/// \returns operand # for the first field of the memory operand or -1 if no
/// memory operands.
/// NOTE: This ignores tied operands. If there is a tied register which is
@@ -1018,7 +1027,7 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
return -1;
case X86II::MRMDestMem:
case X86II::MRMDestMemFSIB:
- return 0;
+ return hasNewDataDest(TSFlags);
case X86II::MRMSrcMem:
case X86II::MRMSrcMemFSIB:
// Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index cab2f0a2e1c1..1947313a9dfb 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -369,6 +369,9 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O,
else if (Flags & X86::IP_HAS_REPEAT)
O << "\trep\t";
+ if (TSFlags & X86II::EVEX_NF)
+ O << "\t{nf}";
+
// These all require a pseudo prefix
if ((Flags & X86::IP_USE_VEX) ||
(TSFlags & X86II::ExplicitOpPrefixMask) == X86II::ExplicitVEXPrefix)
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index b6ebbcf56aef..924956295e7c 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -251,6 +251,7 @@ public:
void setAAA(const MCInst &MI, unsigned OpNum) {
EVEX_aaa = getRegEncoding(MI, OpNum);
}
+ void setNF(bool V) { EVEX_aaa |= V << 2; }
X86OpcodePrefixHelper(const MCRegisterInfo &MRI)
: W(0), R(0), X(0), B(0), M(0), R2(0), X2(0), B2(0), VEX_4V(0), VEX_L(0),
@@ -987,9 +988,11 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
}
Prefix.setW(TSFlags & X86II::REX_W);
+ Prefix.setNF(TSFlags & X86II::EVEX_NF);
bool HasEVEX_K = TSFlags & X86II::EVEX_K;
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+ bool IsND = X86II::hasNewDataDest(TSFlags); // IsND implies HasVEX_4V
bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
switch (TSFlags & X86II::OpMapMask) {
@@ -1049,6 +1052,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
bool EncodeRC = false;
uint8_t EVEX_rc = 0;
+
unsigned CurOp = X86II::getOperandBias(Desc);
switch (TSFlags & X86II::FormMask) {
@@ -1060,7 +1064,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
Prefix.setBB2(MI, MemOperand + X86::AddrBaseReg);
Prefix.setXX2(MI, MemOperand + X86::AddrIndexReg);
CurOp += X86::AddrNumOperands;
- Prefix.set4V(MI, CurOp++);
+ Prefix.set4VV2(MI, CurOp++);
break;
}
case X86II::MRM_C0:
@@ -1073,16 +1077,21 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
// MemAddr, src1(VEX_4V), src2(ModR/M)
// MemAddr, src1(ModR/M), imm8
//
+ // NDD:
+ // dst(VEX_4V), MemAddr, src1(ModR/M)
Prefix.setBB2(MI, MemOperand + X86::AddrBaseReg);
Prefix.setXX2(MI, MemOperand + X86::AddrIndexReg);
Prefix.setV2(MI, MemOperand + X86::AddrIndexReg, HasVEX_4V);
+ if (IsND)
+ Prefix.set4VV2(MI, CurOp++);
+
CurOp += X86::AddrNumOperands;
if (HasEVEX_K)
Prefix.setAAA(MI, CurOp++);
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
Prefix.set4VV2(MI, CurOp++);
Prefix.setRR2(MI, CurOp++);
@@ -1098,12 +1107,18 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
//
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
+ //
+ // NDD:
+ // dst(VEX_4V), src1(ModR/M), MemAddr
+ if (IsND)
+ Prefix.set4VV2(MI, CurOp++);
+
Prefix.setRR2(MI, CurOp++);
if (HasEVEX_K)
Prefix.setAAA(MI, CurOp++);
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
Prefix.set4VV2(MI, CurOp++);
Prefix.setBB2(MI, MemOperand + X86::AddrBaseReg);
@@ -1160,12 +1175,17 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
//
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ //
+ // NDD:
+ // dst(VEX_4V), src1(ModR/M.reg), src2(ModR/M)
+ if (IsND)
+ Prefix.set4VV2(MI, CurOp++);
Prefix.setRR2(MI, CurOp++);
if (HasEVEX_K)
Prefix.setAAA(MI, CurOp++);
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
Prefix.set4VV2(MI, CurOp++);
Prefix.setBB2(MI, CurOp);
@@ -1209,6 +1229,11 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
// dst(ModR/M), src(ModR/M)
// dst(ModR/M), src(ModR/M), imm8
// dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+ //
+ // NDD:
+ // dst(VEX_4V), src1(ModR/M), src2(ModR/M)
+ if (IsND)
+ Prefix.set4VV2(MI, CurOp++);
Prefix.setBB2(MI, CurOp);
Prefix.setX(MI, CurOp, 4);
++CurOp;
@@ -1216,7 +1241,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
if (HasEVEX_K)
Prefix.setAAA(MI, CurOp++);
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
Prefix.set4VV2(MI, CurOp++);
Prefix.setRR2(MI, CurOp++);
@@ -1508,6 +1533,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
unsigned OpcodeOffset = 0;
+ bool IsND = X86II::hasNewDataDest(TSFlags);
+
uint64_t Form = TSFlags & X86II::FormMask;
switch (Form) {
default:
@@ -1576,6 +1603,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
+ if (IsND) // Skip the NDD operand encoded in EVEX_VVVV
+ ++CurOp;
emitRegModRMByte(MI.getOperand(CurOp),
getX86RegNum(MI.getOperand(SrcRegNum)), CB);
@@ -1602,6 +1631,9 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
+ if (IsND) // Skip new data destination
+ ++CurOp;
+
bool ForceSIB = (Form == X86II::MRMDestMemFSIB);
emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
Kind, StartByte, CB, Fixups, STI, ForceSIB);
@@ -1669,6 +1701,9 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
case X86II::MRMSrcMem: {
unsigned FirstMemOp = CurOp + 1;
+ if (IsND) // Skip new data destination
+ CurOp++;
+
if (HasEVEX_K) // Skip writemask
++FirstMemOp;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 5fd6828f4312..e89ddcc570c9 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1256,11 +1256,6 @@ def ProcessorFeatures {
list<SubtargetFeature> SRFFeatures =
!listconcat(ADLFeatures, SRFAdditionalFeatures);
- // Grandridge
- list<SubtargetFeature> GRRAdditionalFeatures = [FeatureRAOINT];
- list<SubtargetFeature> GRRFeatures =
- !listconcat(SRFFeatures, GRRAdditionalFeatures);
-
// Arrowlake S
list<SubtargetFeature> ARLSAdditionalFeatures = [FeatureAVXVNNIINT16,
FeatureSHA512,
@@ -1706,10 +1701,10 @@ foreach P = ["goldmont_plus", "goldmont-plus"] in {
}
def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures,
ProcessorFeatures.TRMTuning>;
-def : ProcModel<"sierraforest", AlderlakePModel, ProcessorFeatures.SRFFeatures,
- ProcessorFeatures.TRMTuning>;
-def : ProcModel<"grandridge", AlderlakePModel, ProcessorFeatures.GRRFeatures,
+foreach P = ["sierraforest", "grandridge"] in {
+ def : ProcModel<P, AlderlakePModel, ProcessorFeatures.SRFFeatures,
ProcessorFeatures.TRMTuning>;
+}
// "Arrandale" along with corei3 and corei5
foreach P = ["nehalem", "corei7", "core_i7_sse4_2"] in {
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 7f134fe1c72b..0ba31e173a1a 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -1306,8 +1306,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
TII.get(Subtarget->is64Bit() ? X86::RET64 : X86::RET32));
}
- for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
- MIB.addReg(RetRegs[i], RegState::Implicit);
+ for (unsigned Reg : RetRegs)
+ MIB.addReg(Reg, RegState::Implicit);
return true;
}
@@ -3346,8 +3346,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign const &VA = ArgLocs[i];
+ for (const CCValAssign &VA : ArgLocs) {
const Value *ArgVal = OutVals[VA.getValNo()];
MVT ArgVT = OutVTs[VA.getValNo()];
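
The X86FastISel change above (and the X86FloatingPoint.cpp hunk just below) is a mechanical cleanup from index-based loops to range-based for loops. The shape of the rewrite, sketched here on a plain std::vector rather than the LLVM containers used in the diff:

#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> RetRegs = {1, 2, 3};

  // Before: explicit index bookkeeping.
  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
    std::printf("reg %u\n", RetRegs[i]);

  // After: range-based for, same iteration order, no index to maintain.
  for (unsigned Reg : RetRegs)
    std::printf("reg %u\n", Reg);
  return 0;
}
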
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index aab2535aa86d..ca4d03913d09 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -462,8 +462,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
// Check to see if any of the values defined by this instruction are dead
// after definition. If so, pop them.
- for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
- unsigned Reg = DeadRegs[i];
+ for (unsigned Reg : DeadRegs) {
// Check if Reg is live on the stack. An inline-asm register operand that
// is in the clobber list and marked dead might not be live on the stack.
static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 7ec59c74f5f5..73b10cf3067e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -487,7 +487,7 @@ namespace {
// from PatFrags in tablegen.
bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
- const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+ const APInt &Val = N->getConstantOperandAPInt(1);
if (Val.countr_one() >= Width)
return true;
@@ -1828,9 +1828,7 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
// That signifies access to globals that are known to be "near",
// such as the GOT itself.
CodeModel::Model M = TM.getCodeModel();
- if (Subtarget->is64Bit() &&
- ((M == CodeModel::Large && !IsRIPRelTLS) ||
- (M == CodeModel::Medium && !IsRIPRel)))
+ if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
return true;
// Base and index reg must be 0 in order to use %rip as base.
@@ -1866,6 +1864,13 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
} else
llvm_unreachable("Unhandled symbol reference node.");
+ // Can't use an addressing mode with large globals.
+ if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
+ TM.isLargeGlobalValue(AM.GV)) {
+ AM = Backup;
+ return true;
+ }
+
if (foldOffsetIntoAddress(Offset, AM)) {
AM = Backup;
return true;
@@ -1910,20 +1915,12 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
// because it has a smaller encoding.
- // TODO: Which other code models can use this?
- switch (TM.getCodeModel()) {
- default: break;
- case CodeModel::Small:
- case CodeModel::Kernel:
- if (Subtarget->is64Bit() &&
- AM.Scale == 1 &&
- AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() == nullptr &&
- AM.IndexReg.getNode() == nullptr &&
- AM.SymbolFlags == X86II::MO_NO_FLAG &&
- AM.hasSymbolicDisplacement())
- AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
- break;
+ if (TM.getCodeModel() != CodeModel::Large &&
+ (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
+ AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
+ AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
}
return false;
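
The matchWrapper/matchAddress hunks above replace the old per-code-model switch with a single predicate: a symbolic displacement may be turned into a RIP-relative base whenever the code model is not large and the referenced global has not been marked large (TM.isLargeGlobalValue), rather than only under the small and kernel models. A rough standalone restatement of that predicate follows; CodeModel and AddrMode here are simplified stand-ins, not the LLVM types.

#include <cstdio>

enum class CodeModel { Small, Kernel, Medium, Large };

struct AddrMode {
  bool Is64Bit = true;
  unsigned Scale = 1;
  bool HasBaseOrIndexReg = false;     // Base_Reg or IndexReg already in use
  bool HasSymbolFlags = false;        // SymbolFlags != MO_NO_FLAG
  bool HasSymbolicDisp = false;       // references a symbol
  bool ReferencesLargeGlobal = false; // TM.isLargeGlobalValue(AM.GV)
};

// Can "foo" be rewritten as "foo(%rip)" for the smaller encoding?
static bool canUseRIPRelative(CodeModel CM, const AddrMode &AM) {
  return CM != CodeModel::Large && !AM.ReferencesLargeGlobal && AM.Is64Bit &&
         AM.Scale == 1 && !AM.HasBaseOrIndexReg && !AM.HasSymbolFlags &&
         AM.HasSymbolicDisp;
}

int main() {
  AddrMode AM;
  AM.HasSymbolicDisp = true;
  std::printf("medium model, ordinary global: %d\n",
              canUseRIPRelative(CodeModel::Medium, AM));
  AM.ReferencesLargeGlobal = true;
  std::printf("medium model, large global:    %d\n",
              canUseRIPRelative(CodeModel::Medium, AM));
  return 0;
}
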
@@ -5236,7 +5233,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
case X86ISD::VPTERNLOG: {
- uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
+ uint8_t Imm = Node->getConstantOperandVal(3);
if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2), Imm))
return;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index db5e4fe84f41..1e4b1361f98a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2267,6 +2267,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
@@ -2282,6 +2284,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
@@ -2674,34 +2678,33 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
-bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
- bool hasSymbolicDisplacement) {
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM,
+ bool HasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
if (!isInt<32>(Offset))
return false;
// If we don't have a symbolic displacement - we don't have any extra
// restrictions.
- if (!hasSymbolicDisplacement)
+ if (!HasSymbolicDisplacement)
return true;
- // FIXME: Some tweaks might be needed for medium code model.
- if (M != CodeModel::Small && M != CodeModel::Kernel)
- return false;
-
- // For small code model we assume that latest object is 16MB before end of 31
- // bits boundary. We may also accept pretty large negative constants knowing
- // that all objects are in the positive half of address space.
- if (M == CodeModel::Small && Offset < 16*1024*1024)
+ // We can fold large offsets in the large code model because we always use
+ // 64-bit offsets.
+ if (CM == CodeModel::Large)
return true;
  // For kernel code model we know that all objects reside in the negative half
  // of the 32-bit address space. We may not accept negative offsets, since they may
// be just off and we may accept pretty large positive ones.
- if (M == CodeModel::Kernel && Offset >= 0)
- return true;
+ if (CM == CodeModel::Kernel)
+ return Offset >= 0;
- return false;
+  // For other non-large code models we assume that the latest small object is
+  // 16MB before the end of the 31-bit boundary. We may also accept pretty large
+  // negative constants, knowing that all objects are in the positive half of
+  // the address space.
+ return Offset < 16 * 1024 * 1024;
}
/// Return true if the condition is a signed comparison operation.
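
The rewritten X86::isOffsetSuitableForCodeModel above keeps the same cases but reorders them: offsets that do not fit a 32-bit immediate are rejected, offsets with no symbolic displacement are always accepted, the large code model accepts any 32-bit offset, the kernel model accepts only non-negative offsets, and the remaining (small and medium) models keep the 16MB headroom bound. A self-contained restatement with a few spot checks; this mirrors the logic shown in the hunk and is not the LLVM declaration.

#include <cassert>
#include <cstdint>
#include <limits>

enum class CM { Small, Kernel, Medium, Large };

static bool isOffsetSuitable(int64_t Offset, CM M, bool HasSymbolicDisp) {
  if (Offset < std::numeric_limits<int32_t>::min() ||
      Offset > std::numeric_limits<int32_t>::max())
    return false;                   // must fit a 32-bit immediate
  if (!HasSymbolicDisp)
    return true;                    // no extra restrictions without a symbol
  if (M == CM::Large)
    return true;                    // 64-bit relocations are used anyway
  if (M == CM::Kernel)
    return Offset >= 0;             // kernel objects sit in the negative half
  return Offset < 16 * 1024 * 1024; // small/medium: keep 16MB of headroom
}

int main() {
  assert(!isOffsetSuitable(-8, CM::Kernel, true));
  assert(isOffsetSuitable(8 * 1024 * 1024, CM::Medium, true));
  assert(!isOffsetSuitable(int64_t(1) << 40, CM::Large, true)); // not 32-bit
  return 0;
}
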
@@ -3738,9 +3741,11 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
// type. This ensures they get CSE'd. But if the integer type is not
// available, use a floating-point +0.0 instead.
SDValue Vec;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
- } else if (VT.isFloatingPoint()) {
+ } else if (VT.isFloatingPoint() &&
+ TLI.isTypeLegal(VT.getVectorElementType())) {
Vec = DAG.getConstantFP(+0.0, dl, VT);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
@@ -7563,8 +7568,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
} else
DstVec = DAG.getUNDEF(VT);
- for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
- unsigned InsertIdx = NonConstIdx[i];
+ for (unsigned InsertIdx : NonConstIdx) {
DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(InsertIdx),
DAG.getIntPtrConstant(InsertIdx, dl));
@@ -31754,7 +31758,7 @@ static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ unsigned IsData = Op.getConstantOperandVal(4);
// We don't support non-data prefetch without PREFETCHI.
// Just preserve the chain.
diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp
index 3e11ab2d98a4..9aa70dff5f93 100644
--- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp
@@ -69,8 +69,8 @@ using PrefetchHints = SampleRecord::CallTargetMap;
// Return any prefetching hints for the specified MachineInstruction. The hints
// are returned as pairs (name, delta).
-ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples,
- const MachineInstr &MI) {
+ErrorOr<const PrefetchHints &>
+getPrefetchHints(const FunctionSamples *TopSamples, const MachineInstr &MI) {
if (const auto &Loc = MI.getDebugLoc())
if (const auto *Samples = TopSamples->findFunctionSamples(Loc))
return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc),
@@ -123,7 +123,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples,
};
static const char *SerializedPrefetchPrefix = "__prefetch";
- const ErrorOr<PrefetchHints> T = getPrefetchHints(TopSamples, MI);
+ auto T = getPrefetchHints(TopSamples, MI);
if (!T)
return false;
int16_t max_index = -1;
@@ -135,8 +135,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples,
int64_t D = static_cast<int64_t>(S_V.second);
unsigned IID = 0;
for (const auto &HintType : HintTypes) {
- if (Name.starts_with(HintType.first)) {
- Name = Name.drop_front(HintType.first.size());
+ if (Name.consume_front(HintType.first)) {
IID = HintType.second;
break;
}
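
The X86InsertPrefetch change above does two small things: getPrefetchHints now returns ErrorOr<const PrefetchHints &> so the hint map is not copied, and the starts_with/drop_front pair is folded into StringRef::consume_front, which tests for the prefix and strips it in one call. A small illustration of the consume_front pattern using std::string_view in place of llvm::StringRef; the helper and the "__prefetch..." sample name below are stand-ins, not the exact strings used by the pass.

#include <cstdio>
#include <string_view>

// Same contract as llvm::StringRef::consume_front: if Name starts with Prefix,
// strip it and return true; otherwise leave Name untouched and return false.
static bool consumeFront(std::string_view &Name, std::string_view Prefix) {
  if (Name.substr(0, Prefix.size()) != Prefix)
    return false;
  Name.remove_prefix(Prefix.size());
  return true;
}

int main() {
  std::string_view Name = "__prefetch_nta_42";
  if (consumeFront(Name, "__prefetch_nta_"))
    std::printf("delta suffix: %.*s\n", (int)Name.size(), Name.data());
  return 0;
}
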
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 2dbb3e5ee316..c47bee070e04 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -14,38 +14,48 @@
//===----------------------------------------------------------------------===//
// AMX instructions
-let Predicates = [HasAMXTILE, In64BitMode] in {
- let SchedRW = [WriteSystem] in {
- let hasSideEffects = 1,
- Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
- def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
- "ldtilecfg\t$src",
- [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
- let hasSideEffects = 1 in
- def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
- "sttilecfg\t$src",
- [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
- let mayLoad = 1 in
- def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
- (ins sibmem:$src),
- "tileloadd\t{$src, $dst|$dst, $src}", []>,
- VEX, T8XD;
- let mayLoad = 1 in
- def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
- (ins sibmem:$src),
- "tileloaddt1\t{$src, $dst|$dst, $src}", []>,
- VEX, T8PD;
+multiclass AMX_TILE_COMMON<string Suffix, Predicate HasEGPR> {
+let Predicates = [HasAMXTILE, HasEGPR, In64BitMode] in {
+ let hasSideEffects = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def LDTILECFG#Suffix : I<0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "ldtilecfg\t$src",
+ [(int_x86_ldtilecfg addr:$src)]>,
+ T8, PS;
+ let hasSideEffects = 1 in
+ def STTILECFG#Suffix : I<0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "sttilecfg\t$src",
+ [(int_x86_sttilecfg addr:$src)]>,
+ T8, PD;
+ let mayLoad = 1 in
+ def TILELOADD#Suffix : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloadd\t{$src, $dst|$dst, $src}", []>,
+ T8, XD;
+ let mayLoad = 1 in
+ def TILELOADDT1#Suffix : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloaddt1\t{$src, $dst|$dst, $src}", []>,
+ T8, PD;
+ let mayStore = 1 in
+ def TILESTORED#Suffix : I<0x4b, MRMDestMemFSIB, (outs),
+ (ins sibmem:$dst, TILE:$src),
+ "tilestored\t{$src, $dst|$dst, $src}", []>,
+ T8, XS;
+}
+}
+
+let SchedRW = [WriteSystem] in {
+ defm "" : AMX_TILE_COMMON<"", NoEGPR>, VEX;
+ defm "" : AMX_TILE_COMMON<"_EVEX", HasEGPR>, EVEX, NoCD8;
+
+ let Predicates = [HasAMXTILE, In64BitMode] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
- "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
- let mayStore = 1 in
- def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
- (ins sibmem:$dst, TILE:$src),
- "tilestored\t{$src, $dst|$dst, $src}", []>,
- VEX, T8XS;
+ "tilerelease", [(int_x86_tilerelease)]>, VEX, T8, PS;
def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins),
"tilezero\t$dst", []>,
- VEX, T8XD;
+ VEX, T8, XD;
  // Pseudo instruction for RA.
let isPseudo = true, mayLoad = 1, hasSideEffects = 1,
@@ -82,8 +92,8 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
[(int_x86_tilezero timm:$src)]>;
}
- } // SchedRW
-} // HasAMXTILE
+ } // Predicates
+} // SchedRW
let Predicates = [HasAMXINT8, In64BitMode] in {
let SchedRW = [WriteSystem] in {
@@ -91,19 +101,19 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8XD;
+ VEX, VVVV, T8, XD;
def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8XS;
+ VEX, VVVV, T8, XS;
def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8PD;
+ VEX, VVVV, T8, PD;
def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8PS;
+ VEX, VVVV, T8;
}
  // Pseudo instruction for RA.
@@ -163,7 +173,7 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>, VEX_4V, T8XS;
+ []>, VEX, VVVV, T8, XS;
  // Pseudo instruction for RA.
let isPseudo = true, Constraints = "$src4 = $dst" in
@@ -193,7 +203,7 @@ let Predicates = [HasAMXFP16, In64BitMode] in {
def TDPFP16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, VEX_4V, T8XD;
+ []>, VEX, VVVV, T8, XD;
}
  // Pseudo instruction for RA.
@@ -222,11 +232,11 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in {
def TCMMIMFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, T8PD, VEX_4V;
+ []>, T8, PD, VEX, VVVV;
def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, VEX_4V, WIG, T8PS;
+ []>, VEX, VVVV, WIG, T8;
} // Constraints = "$src1 = $dst"
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index e1fe2b680b96..c3a673f97d34 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -378,7 +378,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
(iPTR imm))>,
- AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
+ AVX512AIi8Base, EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in
defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
@@ -389,7 +389,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT (From.LdFrag addr:$src2)),
- (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
+ (iPTR imm))>, AVX512AIi8Base, EVEX, VVVV,
EVEX_CD8<From.EltSize, From.CD8TupleForm>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -647,14 +647,14 @@ def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>,
- EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
+ EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
timm:$src3))]>,
- EVEX_4V, EVEX_CD8<32, CD8VT1>,
+ EVEX, VVVV, EVEX_CD8<32, CD8VT1>,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
@@ -1039,7 +1039,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(bitconvert
(DestInfo.VT
(UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
- DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
+ DestInfo.ExeDomain>, T8, PD, EVEX, Sched<[SchedRR]>;
def rrkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
(ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
@@ -1051,7 +1051,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(DestInfo.VT
(X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
MaskInfo.ImmAllZerosV))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
+ DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
(ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
@@ -1065,7 +1065,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(DestInfo.VT
(X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
MaskInfo.RC:$src0))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
+ DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_K, Sched<[SchedRR]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
@@ -1076,7 +1076,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(bitconvert
(DestInfo.VT
(UnmaskedBcastOp addr:$src)))))],
- DestInfo.ExeDomain>, T8PD, EVEX,
+ DestInfo.ExeDomain>, T8, PD, EVEX,
EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
def rmkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
@@ -1090,7 +1090,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(DestInfo.VT
(SrcInfo.BroadcastLdFrag addr:$src)))),
MaskInfo.ImmAllZerosV))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
+ DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_KZ,
EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
let Constraints = "$src0 = $dst",
@@ -1107,7 +1107,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
(DestInfo.VT
(SrcInfo.BroadcastLdFrag addr:$src)))),
MaskInfo.RC:$src0))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
+ DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_K,
EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
}
@@ -1173,7 +1173,7 @@ multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
"vpbroadcast"#_.Suffix, "$src", "$src",
(_.VT (OpNode SrcRC:$src)), /*IsCommutable*/0,
/*IsKCommutable*/0, /*IsKZCommutable*/0, vselect>,
- T8PD, EVEX, Sched<[SchedRR]>;
+ T8, PD, EVEX, Sched<[SchedRR]>;
}
multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR,
@@ -1185,7 +1185,7 @@ multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite Sched
!con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
!con((ins _.KRCWM:$mask), (ins GR32:$src)),
"vpbroadcast"#_.Suffix, "$src", "$src", [], [], [],
- "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
+ "$src0 = $dst">, T8, PD, EVEX, Sched<[SchedRR]>;
def : Pat <(_.VT (OpNode SrcRC:$src)),
(!cast<Instruction>(Name#rr)
@@ -1447,6 +1447,17 @@ def : Pat<(vselect_mask VK8WM:$mask,
(VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
}
+let Predicates = [HasBF16] in {
+ def : Pat<(v32bf16 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
+ def : Pat<(v32bf16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
+}
+
+let Predicates = [HasBF16, HasVLX] in
+ def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X,
@@ -1593,7 +1604,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, _.RC:$src3)), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched]>;
let mayLoad = 1 in
defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
@@ -1601,7 +1612,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
(_.VT (_.LdFrag addr:$src3)))), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1616,7 +1627,7 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src2,
IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
- AVX5128IBase, EVEX_4V, EVEX_B,
+ AVX5128IBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1715,14 +1726,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched]>;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
(_.LdFrag addr:$src3))), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
@@ -1735,7 +1746,7 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
- AVX5128IBase, EVEX_4V, EVEX_B,
+ AVX5128IBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1800,35 +1811,35 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ []>, EVEX, VVVV, EVEX_K, Sched<[sched]>;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_KZ, Sched<[sched]>;
+ []>, EVEX, VVVV, EVEX_KZ, Sched<[sched]>;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ []>, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
+ []>, EVEX, VVVV, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
+ []>, EVEX, VVVV, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1841,7 +1852,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
- EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
@@ -1849,7 +1860,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
- EVEX_4V, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
@@ -1857,7 +1868,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>,
- EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1921,7 +1932,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"$cc, $src2, $src1", "$src1, $src2, $cc",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ timm:$cc)>, EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -1931,7 +1942,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
(OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
timm:$cc),
(OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
- timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ timm:$cc)>, EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
@@ -1944,7 +1955,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
timm:$cc),
(OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
timm:$cc)>,
- EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>;
let isCodeGenOnly = 1 in {
let isCommutable = 1 in
@@ -1955,7 +1966,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
timm:$cc))]>,
- EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
@@ -1964,7 +1975,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
timm:$cc))]>,
- EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -1991,24 +2002,24 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr,
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, Sched<[sched]>;
+ []>, EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1, hasSideEffects = 0 in
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCommutable = IsCommutable, hasSideEffects = 0 in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ []>, EVEX, VVVV, EVEX_K, Sched<[sched]>;
let mayLoad = 1, hasSideEffects = 0 in
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX, VVVV, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr,
@@ -2020,14 +2031,14 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- []>, EVEX_4V, EVEX_K, EVEX_B,
+ []>, EVEX, VVVV, EVEX_K, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -2082,7 +2093,7 @@ defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd",
defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq",
SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
- T8PD, REX_W, EVEX_CD8<64, CD8VF>;
+ T8, REX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb",
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
@@ -2098,7 +2109,7 @@ defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd",
defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq",
SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
- T8PD, REX_W, EVEX_CD8<64, CD8VF>;
+ T8, REX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
@@ -2113,7 +2124,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
[(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
cond)))]>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix,
@@ -2123,7 +2134,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)),
cond)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
@@ -2135,7 +2146,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
(_.KVT (Frag_su:$cc (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
cond))))]>,
- EVEX_4V, EVEX_K, Sched<[sched]>;
+ EVEX, VVVV, EVEX_K, Sched<[sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
u8imm:$cc),
@@ -2148,7 +2159,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)),
cond))))]>,
- EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def : Pat<(_.KVT (Frag:$cc (_.LdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
@@ -2177,7 +2188,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.BroadcastLdFrag addr:$src2),
cond)))]>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, u8imm:$cc),
@@ -2189,7 +2200,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.BroadcastLdFrag addr:$src2),
cond))))]>,
- EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def : Pat<(_.KVT (Frag:$cc (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
@@ -2405,11 +2416,11 @@ multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _,
}
defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>,
- AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ AVX512PDIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
- AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ AVX512PSIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VCMPPH : avx512_vcmp<SchedWriteFCmp, avx512vl_f16_info, HasFP16>,
- AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA;
+ AVX512PSIi8Base, EVEX, VVVV, EVEX_CD8<16, CD8VF>, TA;
// Patterns to select fp compares with load as first operand.
let Predicates = [HasAVX512] in {
@@ -2625,40 +2636,40 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
let Predicates = [HasDQI, NoEGPR] in
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
- VEX, PD;
+ VEX, TB, PD;
let Predicates = [HasDQI, HasEGPR, In64BitMode] in
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem, "_EVEX">,
avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32, "_EVEX">,
- EVEX, PD;
+ EVEX, TB, PD;
let Predicates = [HasAVX512, NoEGPR] in
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
- VEX, PS;
+ VEX, TB;
let Predicates = [HasAVX512, HasEGPR, In64BitMode] in
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem, "_EVEX">,
avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32, "_EVEX">,
- EVEX, PS;
+ EVEX, TB;
let Predicates = [HasBWI, NoEGPR] in {
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
- VEX, PD, REX_W;
+ VEX, TB, PD, REX_W;
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
- VEX, XD;
+ VEX, TB, XD;
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
- VEX, PS, REX_W;
+ VEX, TB, REX_W;
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
- VEX, XD, REX_W;
+ VEX, TB, XD, REX_W;
}
let Predicates = [HasBWI, HasEGPR, In64BitMode] in {
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem, "_EVEX">,
- EVEX, PD, REX_W;
+ EVEX, TB, PD, REX_W;
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32, "_EVEX">,
- EVEX, XD;
+ EVEX, TB, XD;
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem, "_EVEX">,
- EVEX, PS, REX_W;
+ EVEX, TB, REX_W;
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64, "_EVEX">,
- EVEX, XD, REX_W;
+ EVEX, TB, XD, REX_W;
}
// GR from/to mask register
@@ -2769,13 +2780,13 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
X86FoldableSchedWrite sched> {
defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- sched, HasDQI>, VEX, PD;
+ sched, HasDQI>, VEX, TB, PD;
defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- sched, HasAVX512>, VEX, PS;
+ sched, HasAVX512>, VEX, TB;
defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- sched, HasBWI>, VEX, PD, REX_W;
+ sched, HasBWI>, VEX, TB, PD, REX_W;
defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- sched, HasBWI>, VEX, PS, REX_W;
+ sched, HasBWI>, VEX, TB, REX_W;
}
// TODO - do we need a X86SchedWriteWidths::KMASK type?
@@ -2812,13 +2823,13 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, bit IsCommutable,
Predicate prdW = HasAVX512> {
defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- sched, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+ sched, HasDQI, IsCommutable>, VEX, VVVV, VEX_L, TB, PD;
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- sched, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+ sched, prdW, IsCommutable>, VEX, VVVV, VEX_L, TB;
defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, REX_W, PD;
+ sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, TB, PD;
defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, REX_W, PS;
+ sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, TB;
}
// TODO - do we need a X86SchedWriteWidths::KMASK type?
@@ -2869,16 +2880,16 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst),
(ins Src.KRC:$src1, Src.KRC:$src2),
"kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V, VEX_L, Sched<[sched]>;
+ VEX, VVVV, VEX_L, Sched<[sched]>;
def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)),
(!cast<Instruction>(NAME#rr) Src.KRC:$src2, Src.KRC:$src1)>;
}
}
-defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, PD;
-defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, PS;
-defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, PS, REX_W;
+defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, TB, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, TB;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, TB, REX_W;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
@@ -2895,13 +2906,13 @@ multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
Predicate prdW = HasAVX512> {
defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, sched, HasDQI>,
- VEX, PD;
+ VEX, TB, PD;
defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, sched, prdW>,
- VEX, PS;
+ VEX, TB;
defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, sched, HasBWI>,
- VEX, PS, REX_W;
+ VEX, TB, REX_W;
defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, sched, HasBWI>,
- VEX, PD, REX_W;
+ VEX, TB, PD, REX_W;
}
// TODO - do we need a X86SchedWriteWidths::KMASK type?
@@ -2922,15 +2933,15 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
SDNode OpNode, X86FoldableSchedWrite sched> {
defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- sched>, VEX, TAPD, REX_W;
+ sched>, VEX, TA, PD, REX_W;
let Predicates = [HasDQI] in
defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- sched>, VEX, TAPD;
+ sched>, VEX, TA, PD;
let Predicates = [HasBWI] in {
defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- sched>, VEX, TAPD, REX_W;
+ sched>, VEX, TA, PD, REX_W;
defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- sched>, VEX, TAPD;
+ sched>, VEX, TA, PD;
}
}
@@ -3371,25 +3382,25 @@ defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
HasAVX512, SchedWriteFMoveLS, "VMOVAPS">,
- PS, EVEX_CD8<32, CD8VF>;
+ TB, EVEX_CD8<32, CD8VF>;
defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
HasAVX512, SchedWriteFMoveLS, "VMOVAPD">,
- PD, REX_W, EVEX_CD8<64, CD8VF>;
+ TB, PD, REX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
SchedWriteFMoveLS, "VMOVUPS">,
- PS, EVEX_CD8<32, CD8VF>;
+ TB, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
SchedWriteFMoveLS, "VMOVUPD">,
- PD, REX_W, EVEX_CD8<64, CD8VF>;
+ TB, PD, REX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
HasAVX512, SchedWriteVecMoveLS,
@@ -3397,7 +3408,7 @@ defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
HasAVX512, SchedWriteVecMoveLS,
"VMOVDQA", 1>,
- PD, EVEX_CD8<32, CD8VF>;
+ TB, PD, EVEX_CD8<32, CD8VF>;
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
HasAVX512, SchedWriteVecMoveLS,
@@ -3405,31 +3416,31 @@ defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
HasAVX512, SchedWriteVecMoveLS,
"VMOVDQA">,
- PD, REX_W, EVEX_CD8<64, CD8VF>;
+ TB, PD, REX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI,
SchedWriteVecMoveLS, "VMOVDQU", 1>,
avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI,
SchedWriteVecMoveLS, "VMOVDQU", 1>,
- XD, EVEX_CD8<8, CD8VF>;
+ TB, XD, EVEX_CD8<8, CD8VF>;
defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI,
SchedWriteVecMoveLS, "VMOVDQU", 1>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI,
SchedWriteVecMoveLS, "VMOVDQU", 1>,
- XD, REX_W, EVEX_CD8<16, CD8VF>;
+ TB, XD, REX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
SchedWriteVecMoveLS, "VMOVDQU", 1>,
- XS, EVEX_CD8<32, CD8VF>;
+ TB, XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
SchedWriteVecMoveLS, "VMOVDQU">,
- XS, REX_W, EVEX_CD8<64, CD8VF>;
+ TB, XS, REX_W, EVEX_CD8<64, CD8VF>;
// Special instructions to help with spilling when we don't have VLX. We need
// to load or store from a ZMM register instead. These are converted in
@@ -3816,12 +3827,12 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))]>,
- PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>,
+ TB, PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>,
Requires<[HasAVX512]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
- "vmovq\t{$src, $dst|$dst, $src}", []>, PD,
+ "vmovq\t{$src, $dst|$dst, $src}", []>, TB, PD,
EVEX, REX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecStore]>,
Requires<[HasAVX512, In64BitMode]>;
@@ -3830,7 +3841,7 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
"vmovq\t{$src, $dst|$dst, $src}",
[(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
addr:$dst)]>,
- EVEX, PD, REX_W, EVEX_CD8<64, CD8VT1>,
+ EVEX, TB, PD, REX_W, EVEX_CD8<64, CD8VT1>,
Sched<[WriteVecStore]>, Requires<[HasAVX512]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
@@ -3897,7 +3908,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
- _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
+ _.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
let Predicates = [prd] in {
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -3906,7 +3917,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
_.ImmAllZerosV)))],
- _.ExeDomain>, EVEX_4V, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>;
+ _.ExeDomain>, EVEX, VVVV, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -3915,7 +3926,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT _.RC:$src0))))],
- _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
+ _.ExeDomain>, EVEX, VVVV, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
@@ -3954,14 +3965,14 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
- VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
+ VEX_LIG, TB, XS, EVEX_CD8<32, CD8VT1>;
defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
- VEX_LIG, XD, REX_W, EVEX_CD8<64, CD8VT1>;
+ VEX_LIG, TB, XD, REX_W, EVEX_CD8<64, CD8VT1>;
defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info,
HasFP16>,
- VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+ VEX_LIG, T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
PatLeaf ZeroFP, X86VectorVTInfo _> {
@@ -4286,7 +4297,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSHZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovsh\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, T_MAP5XS, EVEX_4V, VEX_LIG,
+ []>, T_MAP5, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
@@ -4295,20 +4306,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src1, VR128X:$src2),
"vmovsh\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- []>, T_MAP5XS, EVEX_K, EVEX_4V, VEX_LIG,
+ []>, T_MAP5, XS, EVEX_K, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSHZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f16x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
"vmovsh\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- []>, EVEX_KZ, T_MAP5XS, EVEX_4V, VEX_LIG,
+ []>, EVEX_KZ, T_MAP5, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
}
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, EVEX_4V, VEX_LIG,
+ []>, TB, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
@@ -4317,20 +4328,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src1, VR128X:$src2),
"vmovss\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ []>, EVEX_K, TB, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
"vmovss\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- []>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ []>, EVEX_KZ, TB, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XD, EVEX_4V, VEX_LIG, REX_W,
+ []>, TB, XD, EVEX, VVVV, VEX_LIG, REX_W,
Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
@@ -4339,7 +4350,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src1, VR128X:$src2),
"vmovsd\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ []>, EVEX_K, TB, XD, EVEX, VVVV, VEX_LIG,
REX_W, Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
@@ -4347,7 +4358,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src2),
"vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- []>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ []>, EVEX_KZ, TB, XD, EVEX, VVVV, VEX_LIG,
REX_W, Sched<[SchedWriteFShuffle.XMM]>;
}
@@ -4546,20 +4557,20 @@ let Predicates = [HasAVX512] in {
def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
[], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.ZMM.RM]>,
- EVEX, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>;
+ EVEX, T8, PD, EVEX_V512, EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
(ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.YMM.RM]>,
- EVEX, T8PD, EVEX_V256, EVEX_CD8<64, CD8VF>;
+ EVEX, T8, PD, EVEX_V256, EVEX_CD8<64, CD8VF>;
def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.XMM.RM]>,
- EVEX, T8PD, EVEX_V128, EVEX_CD8<64, CD8VF>;
+ EVEX, T8, PD, EVEX_V128, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -4585,11 +4596,11 @@ multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
}
defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info,
- SchedWriteVecMoveLSNT>, PD;
+ SchedWriteVecMoveLSNT>, TB, PD;
defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info,
- SchedWriteFMoveLSNT>, PD, REX_W;
+ SchedWriteFMoveLSNT>, TB, PD, REX_W;
defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info,
- SchedWriteFMoveLSNT>, PS;
+ SchedWriteFMoveLSNT>, TB;
let Predicates = [HasAVX512], AddedComplexity = 400 in {
def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
@@ -4665,14 +4676,14 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V,
+ IsCommutable, IsCommutable>, AVX512BIBase, EVEX, VVVV,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>,
- AVX512BIBase, EVEX_4V,
+ AVX512BIBase, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4686,7 +4697,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(_.BroadcastLdFrag addr:$src2)))>,
- AVX512BIBase, EVEX_4V, EVEX_B,
+ AVX512BIBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4796,13 +4807,13 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
IsCommutable>,
- AVX512BIBase, EVEX_4V, Sched<[sched]>;
+ AVX512BIBase, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(_Src.LdFrag addr:$src2)))>,
- AVX512BIBase, EVEX_4V,
+ AVX512BIBase, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
@@ -4812,7 +4823,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
"$src1, ${src2}"#_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>,
- AVX512BIBase, EVEX_4V, EVEX_B,
+ AVX512BIBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4829,22 +4840,22 @@ defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", uaddsat,
defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", usubsat,
SchedWriteVecALU, HasBWI, 0>;
defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
- SchedWritePMULLD, HasAVX512, 1>, T8PD;
+ SchedWritePMULLD, HasAVX512, 1>, T8;
defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
SchedWriteVecIMul, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
- SchedWriteVecIMul, HasDQI, 1>, T8PD,
+ SchedWriteVecIMul, HasDQI, 1>, T8,
NotEVEX2VEXConvertible;
defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul,
HasBWI, 1>;
defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul,
HasBWI, 1>;
defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs,
- SchedWriteVecIMul, HasBWI, 1>, T8PD;
+ SchedWriteVecIMul, HasBWI, 1>, T8;
defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu,
SchedWriteVecALU, HasBWI, 1>;
defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq,
- SchedWriteVecIMul, HasAVX512, 1>, T8PD;
+ SchedWriteVecIMul, HasAVX512, 1>, T8;
defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq,
SchedWriteVecIMul, HasAVX512, 1>;
@@ -4872,7 +4883,7 @@ multiclass avx512_binop_all<bits<8> opc, string OpcodeStr,
defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU,
avx512vl_i8_info, avx512vl_i8_info,
- X86multishift, HasVBMI, 0>, T8PD;
+ X86multishift, HasVBMI, 0>, T8;
multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _Src, X86VectorVTInfo _Dst,
@@ -4884,7 +4895,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"#_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>,
- EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4899,13 +4910,13 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
IsCommutable, IsCommutable>,
- EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(_Src.LdFrag addr:$src2)))>,
- EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_CD8<_Src.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4967,48 +4978,48 @@ defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512B
defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase;
defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
- avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD, WIG;
+ avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8, WIG;
defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, WIG;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
- SchedWriteVecALU, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8;
defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
SchedWriteVecALU, HasBWI, 1>;
defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax,
- SchedWriteVecALU, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax,
- SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ SchedWriteVecALU, HasAVX512, 1>, T8,
NotEVEX2VEXConvertible;
defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
SchedWriteVecALU, HasBWI, 1>;
defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
- SchedWriteVecALU, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8;
defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax,
- SchedWriteVecALU, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax,
- SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ SchedWriteVecALU, HasAVX512, 1>, T8,
NotEVEX2VEXConvertible;
defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
- SchedWriteVecALU, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8;
defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
SchedWriteVecALU, HasBWI, 1>;
defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin,
- SchedWriteVecALU, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin,
- SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ SchedWriteVecALU, HasAVX512, 1>, T8,
NotEVEX2VEXConvertible;
defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
SchedWriteVecALU, HasBWI, 1>;
defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
- SchedWriteVecALU, HasBWI, 1>, T8PD;
+ SchedWriteVecALU, HasBWI, 1>, T8;
defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
- SchedWriteVecALU, HasAVX512, 1>, T8PD;
+ SchedWriteVecALU, HasAVX512, 1>, T8;
defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
- SchedWriteVecALU, HasAVX512, 1>, T8PD,
+ SchedWriteVecALU, HasAVX512, 1>, T8,
NotEVEX2VEXConvertible;
// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
@@ -5445,18 +5456,18 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator
sched.PS.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode,
sched.PS.Scl>,
- XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
sched.PD.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode,
sched.PD.Scl>,
- XD, REX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasFP16] in
defm SHZ : avx512_fp_scalar<opc, OpcodeStr#"sh", f16x_info, OpNode,
VecNode, sched.PH.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sh", f16x_info, RndNode,
sched.PH.Scl>,
- T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>;
+ T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>;
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -5465,16 +5476,16 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
VecNode, SaeNode, sched.PS.Scl, IsCommutable,
NAME#"SS">,
- XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
VecNode, SaeNode, sched.PD.Scl, IsCommutable,
NAME#"SD">,
- XD, REX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasFP16] in {
defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode,
VecNode, SaeNode, sched.PH.Scl, IsCommutable,
NAME#"SH">,
- T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>,
+ T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>,
NotEVEX2VEXConvertible;
}
}
@@ -5515,30 +5526,30 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
- SchedWriteFCmp.Scl, "VMINCSS">, XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
+ SchedWriteFCmp.Scl, "VMINCSS">, TB, XS,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
- SchedWriteFCmp.Scl, "VMINCSD">, XD,
- REX_W, EVEX_4V, VEX_LIG,
+ SchedWriteFCmp.Scl, "VMINCSD">, TB, XD,
+ REX_W, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
- SchedWriteFCmp.Scl, "VMAXCSS">, XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
+ SchedWriteFCmp.Scl, "VMAXCSS">, TB, XS,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
- SchedWriteFCmp.Scl, "VMAXCSD">, XD,
- REX_W, EVEX_4V, VEX_LIG,
+ SchedWriteFCmp.Scl, "VMAXCSD">, TB, XD,
+ REX_W, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc,
- SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
+ SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5, XS,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
NotEVEX2VEXConvertible;
defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc,
- SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
+ SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5, XS,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
NotEVEX2VEXConvertible;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -5556,21 +5567,21 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), ClobberConstraint,
- IsCommutable, IsKCommutable, IsKCommutable>, EVEX_4V, Sched<[sched]>;
+ IsCommutable, IsKCommutable, IsKCommutable>, EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
(MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2)),
- ClobberConstraint>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ ClobberConstraint>, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#suffix,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
(MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
- ClobberConstraint>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ ClobberConstraint>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
}
@@ -5586,7 +5597,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc))),
0, 0, 0, vselect_mask, ClobberConstraint>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
@@ -5597,7 +5608,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
- EVEX_4V, EVEX_B, Sched<[sched]>;
+ EVEX, VVVV, EVEX_B, Sched<[sched]>;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -5607,27 +5618,27 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op
bit IsPD128Commutable = IsCommutable> {
let Predicates = [prd] in {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info,
- sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
+ sched.PS.ZMM, IsCommutable>, EVEX_V512, TB,
EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f64_info,
- sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, REX_W,
+ sched.PD.ZMM, IsCommutable>, EVEX_V512, TB, PD, REX_W,
EVEX_CD8<64, CD8VF>;
}
// Define only if AVX512VL feature is present.
let Predicates = [prd, HasVLX] in {
defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info,
- sched.PS.XMM, IsCommutable>, EVEX_V128, PS,
+ sched.PS.XMM, IsCommutable>, EVEX_V128, TB,
EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info,
- sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
+ sched.PS.YMM, IsCommutable>, EVEX_V256, TB,
EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v2f64x_info,
sched.PD.XMM, IsPD128Commutable,
- IsCommutable>, EVEX_V128, PD, REX_W,
+ IsCommutable>, EVEX_V128, TB, PD, REX_W,
EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f64x_info,
- sched.PD.YMM, IsCommutable>, EVEX_V256, PD, REX_W,
+ sched.PD.YMM, IsCommutable>, EVEX_V256, TB, PD, REX_W,
EVEX_CD8<64, CD8VF>;
}
}
@@ -5637,15 +5648,15 @@ multiclass avx512_fp_binop_ph<bits<8> opc, string OpcodeStr, SDPatternOperator O
X86SchedWriteSizes sched, bit IsCommutable = 0> {
let Predicates = [HasFP16] in {
defm PHZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v32f16_info,
- sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5PS,
+ sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5,
EVEX_CD8<16, CD8VF>;
}
let Predicates = [HasVLX, HasFP16] in {
defm PHZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f16x_info,
- sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5PS,
+ sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5,
EVEX_CD8<16, CD8VF>;
defm PHZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f16x_info,
- sched.PH.YMM, IsCommutable>, EVEX_V256, T_MAP5PS,
+ sched.PH.YMM, IsCommutable>, EVEX_V256, T_MAP5,
EVEX_CD8<16, CD8VF>;
}
}
@@ -5656,14 +5667,14 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR
let Predicates = [HasFP16] in {
defm PHZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM,
v32f16_info>,
- EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>;
}
defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ EVEX_V512, TB, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
v8f64_info>,
- EVEX_V512, PD, REX_W,EVEX_CD8<64, CD8VF>;
+ EVEX_V512, TB, PD, REX_W,EVEX_CD8<64, CD8VF>;
}
let Uses = [MXCSR] in
@@ -5672,14 +5683,14 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd
let Predicates = [HasFP16] in {
defm PHZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM,
v32f16_info>,
- EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>;
}
defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ EVEX_V512, TB, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM,
v8f64_info>,
- EVEX_V512, PD, REX_W,EVEX_CD8<64, CD8VF>;
+ EVEX_V512, TB, PD, REX_W,EVEX_CD8<64, CD8VF>;
}
defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512,
@@ -5734,18 +5745,18 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5770,43 +5781,43 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr
let Predicates = [HasFP16] in {
defm PHZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32f16_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v32f16_info>,
- EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ EVEX_V512, T_MAP6, PD, EVEX_CD8<16, CD8VF>;
defm SHZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f16x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"sh", f16x_info, X86scalefsRnd, sched.Scl>,
- EVEX_4V, T_MAP6PD, EVEX_CD8<16, CD8VT1>;
+ EVEX, VVVV, T_MAP6, PD, EVEX_CD8<16, CD8VT1>;
}
defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>,
- EVEX_V512, EVEX_CD8<32, CD8VF>, T8PD;
+ EVEX_V512, EVEX_CD8<32, CD8VF>, T8, PD;
defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
- EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>, T8PD;
+ EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>, T8, PD;
defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD;
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8, PD;
defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8PD;
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8, PD;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>,
- EVEX_V128, EVEX_CD8<32, CD8VF>, T8PD;
+ EVEX_V128, EVEX_CD8<32, CD8VF>, T8, PD;
defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>,
- EVEX_V256, EVEX_CD8<32, CD8VF>, T8PD;
+ EVEX_V256, EVEX_CD8<32, CD8VF>, T8, PD;
defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>,
- EVEX_V128, REX_W, EVEX_CD8<64, CD8VF>, T8PD;
+ EVEX_V128, REX_W, EVEX_CD8<64, CD8VF>, T8, PD;
defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>,
- EVEX_V256, REX_W, EVEX_CD8<64, CD8VF>, T8PD;
+ EVEX_V256, REX_W, EVEX_CD8<64, CD8VF>, T8, PD;
}
let Predicates = [HasFP16, HasVLX] in {
defm PHZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8f16x_info>,
- EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6PD;
+ EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PD;
defm PHZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16f16x_info>,
- EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6PD;
+ EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PD;
}
}
defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
@@ -5825,13 +5836,13 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(null_frag), (null_frag), 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(null_frag), (null_frag)>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5844,7 +5855,7 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(null_frag), (null_frag)>,
- EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX_B, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5898,9 +5909,9 @@ multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string Opcode
avx512_vptest_dq<opc_dq, OpcodeStr, sched>;
defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm",
- SchedWriteVecLogic>, T8PD;
+ SchedWriteVecLogic>, T8, PD;
defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm",
- SchedWriteVecLogic>, T8XS;
+ SchedWriteVecLogic>, T8, XS;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
@@ -5944,13 +5955,13 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>,
- AVX512BIBase, EVEX_4V, Sched<[sched]>;
+ AVX512BIBase, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>,
AVX512BIBase,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6035,22 +6046,22 @@ multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli,
SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,
SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai,
SchedWriteVecShiftImm, 1>,
avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl,
SchedWriteVecShift>;
@@ -6097,13 +6108,13 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>,
- AVX5128IBase, EVEX_4V, Sched<[sched]>;
+ AVX5128IBase, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(_.VT (_.LdFrag addr:$src2))))>,
- AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX5128IBase, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6116,7 +6127,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>,
- AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX5128IBase, EVEX_B, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6374,14 +6385,14 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(Ctrl.VT Ctrl.RC:$src2)))>,
- T8PD, EVEX_4V, Sched<[sched]>;
+ T8, PD, EVEX, VVVV, Sched<[sched]>;
defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (Ctrl.LdFrag addr:$src2))))>,
- T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ T8, PD, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
@@ -6390,7 +6401,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>,
- T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ T8, PD, EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6469,13 +6480,13 @@ def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>,
- Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX, VVVV;
let isCommutable = 1 in
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))]>,
- Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX, VVVV;
//===----------------------------------------------------------------------===//
// VMOVHPS/PD VMOVLPS Instructions
@@ -6494,19 +6505,19 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr,
(OpNode _.RC:$src1,
(_.VT (bitconvert
(v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))]>,
- Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>, EVEX_4V;
+ Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>, EVEX, VVVV;
}
// No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in
// SSE1. And MOVLPS pattern is even more complex.
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag,
- v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB;
defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl,
- v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, REX_W;
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, TB, PD, REX_W;
defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", null_frag,
- v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+ v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB;
defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd,
- v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, REX_W;
+ v2f64x_info>, EVEX_CD8<64, CD8VT1>, TB, PD, REX_W;
let Predicates = [HasAVX512] in {
// VMOVHPD patterns
@@ -6565,14 +6576,14 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -6583,7 +6594,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))),
(MaskOpNode _.RC:$src2,
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
@@ -6598,7 +6609,7 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -6627,13 +6638,13 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDPatternOperator O
SDNode MaskOpNode, SDNode OpNodeRnd> {
defm PH : avx512_fma3p_213_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f16_info, HasFP16>, T_MAP6PD;
+ avx512vl_f16_info, HasFP16>, T_MAP6, PD;
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info>, T8PD;
+ avx512vl_f32_info>, T8, PD;
defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info>, T8PD, REX_W;
+ avx512vl_f64_info>, T8, PD, REX_W;
}
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma,
@@ -6660,14 +6671,14 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -6679,7 +6690,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1)), 1, 0>, EVEX_4V, EVEX_B,
+ _.RC:$src1)), 1, 0>, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
@@ -6695,7 +6706,7 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
- 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -6724,13 +6735,13 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDPatternOperator O
SDNode MaskOpNode, SDNode OpNodeRnd > {
defm PH : avx512_fma3p_231_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f16_info, HasFP16>, T_MAP6PD;
+ avx512vl_f16_info, HasFP16>, T_MAP6, PD;
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info>, T8PD;
+ avx512vl_f32_info>, T8, PD;
defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info>, T8PD, REX_W;
+ avx512vl_f64_info>, T8, PD, REX_W;
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma,
@@ -6756,7 +6767,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns this helps tablegen's duplicate pattern detection.
@@ -6765,7 +6776,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
// Pattern is 312 order so that the load is in a different place from the
@@ -6778,7 +6789,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2)), 1, 0>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
@@ -6793,7 +6804,7 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
- 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -6822,13 +6833,13 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDPatternOperator O
SDNode MaskOpNode, SDNode OpNodeRnd > {
defm PH : avx512_fma3p_132_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f16_info, HasFP16>, T_MAP6PD;
+ avx512vl_f16_info, HasFP16>, T_MAP6, PD;
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f32_info>, T8PD;
+ avx512vl_f32_info>, T8, PD;
defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
OpNodeRnd, SchedWriteFMA,
- avx512vl_f64_info>, T8PD, REX_W;
+ avx512vl_f64_info>, T8, PD, REX_W;
}
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma,
@@ -6851,33 +6862,33 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- EVEX_4V, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
+ EVEX, VVVV, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
let mayLoad = 1 in
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- EVEX_4V, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold,
+ EVEX, VVVV, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold,
SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
let isCodeGenOnly = 1, isCommutable = 1 in {
def r : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX_4V, SIMD_EXC;
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX, VVVV, SIMD_EXC;
def m : AVX512<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold,
- SchedWriteFMA.Scl.ReadAfterFold]>, EVEX_4V, SIMD_EXC;
+ SchedWriteFMA.Scl.ReadAfterFold]>, EVEX, VVVV, SIMD_EXC;
let Uses = [MXCSR] in
def rb : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -6885,7 +6896,7 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
!strconcat(OpcodeStr,
"\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"),
!if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
- Sched<[SchedWriteFMA.Scl]>, EVEX_4V;
+ Sched<[SchedWriteFMA.Scl]>, EVEX, VVVV;
}// isCodeGenOnly = 1
}// Constraints = "$src1 = $dst"
}
@@ -6929,15 +6940,15 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f32x_info, "SS">,
- EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD;
+ EVEX_CD8<32, CD8VT1>, VEX_LIG, T8, PD;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f64x_info, "SD">,
- EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD;
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8, PD;
}
let Predicates = [HasFP16] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnd, f16x_info, "SH">,
- EVEX_CD8<16, CD8VT1>, VEX_LIG, T_MAP6PD;
+ EVEX_CD8<16, CD8VT1>, VEX_LIG, T_MAP6, PD;
}
}
@@ -7189,13 +7200,13 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- T8PD, EVEX_4V, Sched<[sched]>;
+ T8, PD, EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
- T8PD, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ T8, PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -7205,7 +7216,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)>,
- T8PD, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
+ T8, PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
@@ -7247,19 +7258,19 @@ let ExeDomain = DstVT.ExeDomain, Uses = _Uses,
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ EVEX, VVVV, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2),
!strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>,
- EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ EVEX, VVVV, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, x86memop:$src2),
@@ -7267,7 +7278,7 @@ let ExeDomain = DstVT.ExeDomain, Uses = _Uses,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
(ld_frag addr:$src2)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst,
@@ -7287,7 +7298,7 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
(i32 timm:$rc)))]>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}",
(!cast<Instruction>(NAME#"rrb_Int") DstVT.RC:$dst,
DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), 0, "att">;
@@ -7307,18 +7318,18 @@ let Predicates = [HasAVX512] in {
defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32, "cvtsi2ss", "l">,
- XS, EVEX_CD8<32, CD8VT1>;
+ TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
WriteCvtI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">,
- XS, REX_W, EVEX_CD8<64, CD8VT1>;
+ TB, XS, REX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32,
v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l", [], 0>,
- XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
WriteCvtI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtsi2sd", "q">,
- XD, REX_W, EVEX_CD8<64, CD8VT1>;
+ TB, XD, REX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
@@ -7346,18 +7357,18 @@ def : Pat<(f64 (any_sint_to_fp GR64:$src)),
defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
WriteCvtI2SS, GR32,
v4f32x_info, i32mem, loadi32,
- "cvtusi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>;
+ "cvtusi2ss", "l">, TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
WriteCvtI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">,
- XS, REX_W, EVEX_CD8<64, CD8VT1>;
+ TB, XS, REX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info,
i32mem, loadi32, "cvtusi2sd", "l", [], 0>,
- XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
WriteCvtI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtusi2sd", "q">,
- XD, REX_W, EVEX_CD8<64, CD8VT1>;
+ TB, XD, REX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTUSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
@@ -7422,28 +7433,28 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
// Convert float/double to signed/unsigned int 32/64
defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,X86cvts2si,
X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{l}">,
- XS, EVEX_CD8<32, CD8VT1>;
+ TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{q}">,
- XS, REX_W, EVEX_CD8<32, CD8VT1>;
+ TB, XS, REX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{l}">,
- XS, EVEX_CD8<32, CD8VT1>;
+ TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{q}">,
- XS, REX_W, EVEX_CD8<32, CD8VT1>;
+ TB, XS, REX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{l}">,
- XD, EVEX_CD8<64, CD8VT1>;
+ TB, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{q}">,
- XD, REX_W, EVEX_CD8<64, CD8VT1>;
+ TB, XD, REX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{l}">,
- XD, EVEX_CD8<64, CD8VT1>;
+ TB, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">,
- XD, REX_W, EVEX_CD8<64, CD8VT1>;
+ TB, XD, REX_W, EVEX_CD8<64, CD8VT1>;
multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT,
X86VectorVTInfo DstVT, SDNode OpNode,
@@ -7463,13 +7474,13 @@ multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT,
}
defm VCVTSS2SIZ: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i32x_info,
- lrint, WriteCvtSS2I>, XS, EVEX_CD8<32, CD8VT1>;
+ lrint, WriteCvtSS2I>, TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i64x_info,
- llrint, WriteCvtSS2I>, REX_W, XS, EVEX_CD8<32, CD8VT1>;
+ llrint, WriteCvtSS2I>, REX_W, TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i32x_info,
- lrint, WriteCvtSD2I>, XD, EVEX_CD8<64, CD8VT1>;
+ lrint, WriteCvtSD2I>, TB, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i64x_info,
- llrint, WriteCvtSD2I>, REX_W, XD, EVEX_CD8<64, CD8VT1>;
+ llrint, WriteCvtSD2I>, REX_W, TB, XD, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasAVX512] in {
def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64Zrr FR32:$src)>;
@@ -7609,29 +7620,29 @@ let Predicates = [prd], ExeDomain = _SrcRC.ExeDomain in {
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
- "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+ "{l}">, TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
- "{q}">, REX_W, XS, EVEX_CD8<32, CD8VT1>;
+ "{q}">, REX_W, TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
- "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+ "{l}">, TB, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
- "{q}">, REX_W, XD, EVEX_CD8<64, CD8VT1>;
+ "{q}">, REX_W, TB, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
- "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+ "{l}">, TB, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
- "{q}">, XS,REX_W, EVEX_CD8<32, CD8VT1>;
+ "{q}">, TB, XS,REX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
- "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+ "{l}">, TB, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
- "{q}">, XD, REX_W, EVEX_CD8<64, CD8VT1>;
+ "{q}">, TB, XD, REX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
// AVX-512 Convert form float to double and back
@@ -7646,25 +7657,25 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2)))>,
- EVEX_4V, VEX_LIG, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.ScalarIntMemFrags addr:$src2)))>,
- EVEX_4V, VEX_LIG,
+ EVEX, VVVV, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX_4V, VEX_LIG, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -7678,7 +7689,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeSAE (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2)))>,
- EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>;
}
// Scalar Conversion with rounding control (RC)
@@ -7691,7 +7702,7 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>,
- EVEX_4V, VEX_LIG, Sched<[sched]>,
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_trunc<bits<8> opc, string OpcodeStr,
@@ -7719,22 +7730,22 @@ multiclass avx512_cvt_fp_scalar_extend<bits<8> opc, string OpcodeStr,
}
defm VCVTSD2SS : avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2ss", X86frounds,
X86froundsRnd, WriteCvtSD2SS, f64x_info,
- f32x_info>, XD, REX_W;
+ f32x_info>, TB, XD, REX_W;
defm VCVTSS2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtss2sd", X86fpexts,
X86fpextsSAE, WriteCvtSS2SD, f32x_info,
- f64x_info>, XS;
+ f64x_info>, TB, XS;
defm VCVTSD2SH : avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2sh", X86frounds,
X86froundsRnd, WriteCvtSD2SS, f64x_info,
- f16x_info, HasFP16>, T_MAP5XD, REX_W;
+ f16x_info, HasFP16>, T_MAP5, XD, REX_W;
defm VCVTSH2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtsh2sd", X86fpexts,
X86fpextsSAE, WriteCvtSS2SD, f16x_info,
- f64x_info, HasFP16>, T_MAP5XS;
+ f64x_info, HasFP16>, T_MAP5, XS;
defm VCVTSS2SH : avx512_cvt_fp_scalar_trunc<0x1D, "vcvtss2sh", X86frounds,
X86froundsRnd, WriteCvtSD2SS, f32x_info,
- f16x_info, HasFP16>, T_MAP5PS;
+ f16x_info, HasFP16>, T_MAP5;
defm VCVTSH2SS : avx512_cvt_fp_scalar_extend<0x13, "vcvtsh2ss", X86fpexts,
X86fpextsSAE, WriteCvtSS2SD, f16x_info,
- f32x_info, HasFP16>, T_MAP6PS;
+ f32x_info, HasFP16>, T_MAP6;
def : Pat<(f64 (any_fpextend FR32X:$src)),
(VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
@@ -7996,10 +8007,10 @@ multiclass avx512_cvt_trunc<bits<8> opc, string OpcodeStr,
defm VCVTPD2PS : avx512_cvt_trunc<0x5A, "vcvtpd2ps",
avx512vl_f32_info, avx512vl_f64_info, SchedWriteCvtPD2PS>,
- REX_W, PD, EVEX_CD8<64, CD8VF>;
+ REX_W, TB, PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2PD : avx512_cvt_extend<0x5A, "vcvtps2pd",
avx512vl_f64_info, avx512vl_f32_info, SchedWriteCvtPS2PD>,
- PS, EVEX_CD8<32, CD8VH>;
+ TB, EVEX_CD8<32, CD8VH>;
// Extend Half to Double
multiclass avx512_cvtph2pd<bits<8> opc, string OpcodeStr,
@@ -8108,14 +8119,14 @@ multiclass avx512_cvtpd2ph<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc
defm VCVTPS2PHX : avx512_cvt_trunc<0x1D, "vcvtps2phx", avx512vl_f16_info,
avx512vl_f32_info, SchedWriteCvtPD2PS,
- HasFP16>, T_MAP5PD, EVEX_CD8<32, CD8VF>;
+ HasFP16>, T_MAP5, PD, EVEX_CD8<32, CD8VF>;
defm VCVTPH2PSX : avx512_cvt_extend<0x13, "vcvtph2psx", avx512vl_f32_info,
avx512vl_f16_info, SchedWriteCvtPS2PD,
- HasFP16>, T_MAP6PD, EVEX_CD8<16, CD8VH>;
+ HasFP16>, T_MAP6, PD, EVEX_CD8<16, CD8VH>;
defm VCVTPD2PH : avx512_cvtpd2ph<0x5A, "vcvtpd2ph", SchedWriteCvtPD2PS>,
- REX_W, T_MAP5PD, EVEX_CD8<64, CD8VF>;
+ REX_W, T_MAP5, PD, EVEX_CD8<64, CD8VF>;
defm VCVTPH2PD : avx512_cvtph2pd<0x5A, "vcvtph2pd", SchedWriteCvtPS2PD>,
- T_MAP5PS, EVEX_CD8<16, CD8VQ>;
+ T_MAP5, EVEX_CD8<16, CD8VQ>;
let Predicates = [HasFP16, HasVLX] in {
// Special patterns to allow use of X86vmfpround for masking. Instruction
@@ -8596,120 +8607,120 @@ multiclass avx512_cvtqq2ps_dq2ph<bits<8> opc, string OpcodeStr, SDPatternOperato
defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, sint_to_fp,
X86any_VSintToFP, X86VSintToFP,
- SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
+ SchedWriteCvtDQ2PD>, TB, XS, EVEX_CD8<32, CD8VH>;
defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
- PS, EVEX_CD8<32, CD8VF>;
+ TB, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si,
X86cvttp2si, X86cvttp2siSAE,
- SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>;
+ SchedWriteCvtPS2DQ>, TB, XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si,
X86cvttp2si, X86cvttp2siSAE,
SchedWriteCvtPD2DQ>,
- PD, REX_W, EVEX_CD8<64, CD8VF>;
+ TB, PD, REX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui,
X86cvttp2ui, X86cvttp2uiSAE,
- SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>;
+ SchedWriteCvtPS2DQ>, TB, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui,
X86cvttp2ui, X86cvttp2uiSAE,
SchedWriteCvtPD2DQ>,
- PS, REX_W, EVEX_CD8<64, CD8VF>;
+ TB, REX_W, EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp,
uint_to_fp, X86any_VUintToFP, X86VUintToFP,
- SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
+ SchedWriteCvtDQ2PD>, TB, XS, EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp,
uint_to_fp, X86VUintToFpRnd,
- SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>;
+ SchedWriteCvtDQ2PS>, TB, XD, EVEX_CD8<32, CD8VF>;
defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, X86cvtp2Int,
- X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, TB, PD,
EVEX_CD8<32, CD8VF>;
defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, X86cvtp2Int,
- X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD,
+ X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, TB, XD,
REX_W, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>,
- PS, EVEX_CD8<32, CD8VF>;
+ TB, EVEX_CD8<32, CD8VF>;
defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, REX_W,
- PS, EVEX_CD8<64, CD8VF>;
+ TB, EVEX_CD8<64, CD8VF>;
defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, REX_W,
- PD, EVEX_CD8<64, CD8VF>;
+ TB, PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, X86cvtp2Int,
- X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, TB, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, REX_W,
- PD, EVEX_CD8<64, CD8VF>;
+ TB, PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UInt,
- X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, TB, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si,
X86cvttp2si, X86cvttp2siSAE,
SchedWriteCvtPD2DQ>, REX_W,
- PD, EVEX_CD8<64, CD8VF>;
+ TB, PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si,
X86cvttp2si, X86cvttp2siSAE,
- SchedWriteCvtPS2DQ>, PD,
+ SchedWriteCvtPS2DQ>, TB, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui,
X86cvttp2ui, X86cvttp2uiSAE,
SchedWriteCvtPD2DQ>, REX_W,
- PD, EVEX_CD8<64, CD8VF>;
+ TB, PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui,
X86cvttp2ui, X86cvttp2uiSAE,
- SchedWriteCvtPS2DQ>, PD,
+ SchedWriteCvtPS2DQ>, TB, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp,
sint_to_fp, X86VSintToFpRnd,
- SchedWriteCvtDQ2PD>, REX_W, XS, EVEX_CD8<64, CD8VF>;
+ SchedWriteCvtDQ2PD>, REX_W, TB, XS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp,
uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>,
- REX_W, XS, EVEX_CD8<64, CD8VF>;
+ REX_W, TB, XS, EVEX_CD8<64, CD8VF>;
defm VCVTDQ2PH : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtdq2ph", any_sint_to_fp, sint_to_fp,
X86any_VSintToFP, X86VMSintToFP,
X86VSintToFpRnd, avx512vl_f16_info, avx512vl_i32_info,
SchedWriteCvtDQ2PS, HasFP16>,
- T_MAP5PS, EVEX_CD8<32, CD8VF>;
+ T_MAP5, EVEX_CD8<32, CD8VF>;
defm VCVTUDQ2PH : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtudq2ph", any_uint_to_fp, uint_to_fp,
X86any_VUintToFP, X86VMUintToFP,
X86VUintToFpRnd, avx512vl_f16_info, avx512vl_i32_info,
- SchedWriteCvtDQ2PS, HasFP16>, T_MAP5XD,
+ SchedWriteCvtDQ2PS, HasFP16>, T_MAP5, XD,
EVEX_CD8<32, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtqq2ps", any_sint_to_fp, sint_to_fp,
X86any_VSintToFP, X86VMSintToFP,
X86VSintToFpRnd, avx512vl_f32_info, avx512vl_i64_info,
- SchedWriteCvtDQ2PS>, REX_W, PS,
+ SchedWriteCvtDQ2PS>, REX_W, TB,
EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtuqq2ps", any_uint_to_fp, uint_to_fp,
X86any_VUintToFP, X86VMUintToFP,
X86VUintToFpRnd, avx512vl_f32_info, avx512vl_i64_info,
- SchedWriteCvtDQ2PS>, REX_W, XD,
+ SchedWriteCvtDQ2PS>, REX_W, TB, XD,
EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
@@ -8912,12 +8923,12 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
(ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
(X86any_cvtph2ps (_src.VT _src.RC:$src)),
(X86cvtph2ps (_src.VT _src.RC:$src))>,
- T8PD, Sched<[sched]>;
+ T8, PD, Sched<[sched]>;
defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
(ins x86memop:$src), "vcvtph2ps", "$src", "$src",
(X86any_cvtph2ps (_src.VT ld_dag)),
(X86cvtph2ps (_src.VT ld_dag))>,
- T8PD, Sched<[sched.Folded]>;
+ T8, PD, Sched<[sched.Folded]>;
}
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
@@ -8927,7 +8938,7 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
(ins _src.RC:$src), "vcvtph2ps",
"{sae}, $src", "$src, {sae}",
(X86cvtph2psSAE (_src.VT _src.RC:$src))>,
- T8PD, EVEX_B, Sched<[sched]>;
+ T8, PD, EVEX_B, Sched<[sched]>;
}
let Predicates = [HasAVX512] in
@@ -9068,55 +9079,55 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86any_fcmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ "ucomiss", SSEPackedSingle>, TB, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86any_fcmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, PD, EVEX,
+ "ucomisd", SSEPackedDouble>, TB, PD, EVEX,
VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, X86strict_fcmps, f32, f32mem, loadf32,
- "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ "comiss", SSEPackedSingle>, TB, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, X86strict_fcmps, f64, f64mem, loadf64,
- "comisd", SSEPackedDouble>, PD, EVEX,
+ "comisd", SSEPackedDouble>, TB, PD, EVEX,
VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
let isCodeGenOnly = 1 in {
defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ sse_load_f32, "ucomiss", SSEPackedSingle>, TB, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSEPackedDouble>, PD, EVEX,
+ sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, EVEX,
VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ sse_load_f32, "comiss", SSEPackedSingle>, TB, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSEPackedDouble>, PD, EVEX,
+ sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, EVEX,
VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
}
}
let Defs = [EFLAGS], Predicates = [HasFP16] in {
defm VUCOMISHZ : avx512_ord_cmp_sae<0x2E, v8f16x_info, "vucomish",
- SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS,
+ SSEPackedSingle>, AVX512PSIi8Base, T_MAP5,
EVEX_CD8<16, CD8VT1>;
defm VCOMISHZ : avx512_ord_cmp_sae<0x2F, v8f16x_info, "vcomish",
- SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS,
+ SSEPackedSingle>, AVX512PSIi8Base, T_MAP5,
EVEX_CD8<16, CD8VT1>;
defm VUCOMISHZ : sse12_ord_cmp<0x2E, FR16X, X86any_fcmp, f16, f16mem, loadf16,
- "ucomish", SSEPackedSingle>, T_MAP5PS, EVEX,
+ "ucomish", SSEPackedSingle>, T_MAP5, EVEX,
VEX_LIG, EVEX_CD8<16, CD8VT1>;
defm VCOMISHZ : sse12_ord_cmp<0x2F, FR16X, X86strict_fcmps, f16, f16mem, loadf16,
- "comish", SSEPackedSingle>, T_MAP5PS, EVEX,
+ "comish", SSEPackedSingle>, T_MAP5, EVEX,
VEX_LIG, EVEX_CD8<16, CD8VT1>;
let isCodeGenOnly = 1 in {
defm VUCOMISHZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v8f16, shmem,
sse_load_f16, "ucomish", SSEPackedSingle>,
- T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>;
+ T_MAP5, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>;
defm VCOMISHZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8f16, shmem,
sse_load_f16, "comish", SSEPackedSingle>,
- T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>;
+ T_MAP5, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>;
}
}
@@ -9129,35 +9140,35 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
- EVEX_4V, VEX_LIG, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG,
+ (_.ScalarIntMemFrags addr:$src2))>, EVEX, VVVV, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
defm VRCPSHZ : avx512_fp14_s<0x4D, "vrcpsh", X86rcp14s, SchedWriteFRcp.Scl,
f16x_info, HasFP16>, EVEX_CD8<16, CD8VT1>,
- T_MAP6PD;
+ T_MAP6, PD;
defm VRSQRTSHZ : avx512_fp14_s<0x4F, "vrsqrtsh", X86rsqrt14s,
SchedWriteFRsqrt.Scl, f16x_info, HasFP16>,
- EVEX_CD8<16, CD8VT1>, T_MAP6PD;
+ EVEX_CD8<16, CD8VT1>, T_MAP6, PD;
let Uses = [MXCSR] in {
defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl,
f32x_info>, EVEX_CD8<32, CD8VT1>,
- T8PD;
+ T8, PD;
defm VRCP14SDZ : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SchedWriteFRcp.Scl,
f64x_info>, REX_W, EVEX_CD8<64, CD8VT1>,
- T8PD;
+ T8, PD;
defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s,
SchedWriteFRsqrt.Scl, f32x_info>,
- EVEX_CD8<32, CD8VT1>, T8PD;
+ EVEX_CD8<32, CD8VT1>, T8, PD;
defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s,
SchedWriteFRsqrt.Scl, f64x_info>, REX_W,
- EVEX_CD8<64, CD8VT1>, T8PD;
+ EVEX_CD8<64, CD8VT1>, T8, PD;
}
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
@@ -9166,19 +9177,19 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode _.RC:$src))>, EVEX, T8PD,
+ (_.VT (OpNode _.RC:$src))>, EVEX, T8, PD,
Sched<[sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT
- (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD,
+ (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8, PD,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
(OpNode (_.VT
(_.BroadcastLdFrag addr:$src)))>,
- EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, T8, PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -9192,7 +9203,7 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
let Predicates = [HasFP16] in
defm PHZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"), OpNode, sched.ZMM,
- v32f16_info>, EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ v32f16_info>, EVEX_V512, T_MAP6, EVEX_CD8<16, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX], Uses = [MXCSR] in {
@@ -9212,10 +9223,10 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasFP16, HasVLX] in {
defm PHZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"),
OpNode, sched.XMM, v8f16x_info>,
- EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ EVEX_V128, T_MAP6, EVEX_CD8<16, CD8VF>;
defm PHZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"),
OpNode, sched.YMM, v16f16x_info>,
- EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ EVEX_V256, T_MAP6, EVEX_CD8<16, CD8VF>;
}
}
@@ -9250,16 +9261,16 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX_4V;
+ sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8, PD, EVEX, VVVV;
defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD, EVEX_4V;
+ sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8, PD, EVEX, VVVV;
}
multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
let Predicates = [HasFP16] in
defm SHZ : avx512_fp28_s<opc, OpcodeStr#"sh", f16x_info, OpNode, OpNodeSAE, sched>,
- EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX_4V;
+ EVEX_CD8<16, CD8VT1>, T_MAP6, PD, EVEX, VVVV;
}
let Predicates = [HasERI] in {
@@ -9311,10 +9322,10 @@ multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
avx512_fp28_p_sae<opc, OpcodeStr#"ps", v16f32_info, OpNodeSAE, sched.ZMM>,
- T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ T8, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>,
avx512_fp28_p_sae<opc, OpcodeStr#"pd", v8f64_info, OpNodeSAE, sched.ZMM>,
- T8PD, EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>;
+ T8, PD, EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
@@ -9323,16 +9334,16 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode,
sched.XMM>,
- EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+ EVEX_V128, T8, PD, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode,
sched.YMM>,
- EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+ EVEX_V256, T8, PD, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode,
sched.XMM>,
- EVEX_V128, REX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ EVEX_V128, REX_W, T8, PD, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode,
sched.YMM>,
- EVEX_V256, REX_W, T8PD, EVEX_CD8<64, CD8VF>;
+ EVEX_V256, REX_W, T8, PD, EVEX_CD8<64, CD8VF>;
}
}
@@ -9341,12 +9352,12 @@ multiclass avx512_vgetexp_fp16<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasFP16] in
defm PHZ : avx512_fp28_p<opc, OpcodeStr#"ph", v32f16_info, OpNode, sched.ZMM>,
avx512_fp28_p_sae<opc, OpcodeStr#"ph", v32f16_info, OpNodeSAE, sched.ZMM>,
- T_MAP6PD, EVEX_V512, EVEX_CD8<16, CD8VF>;
+ T_MAP6, PD, EVEX_V512, EVEX_CD8<16, CD8VF>;
let Predicates = [HasFP16, HasVLX] in {
defm PHZ128 : avx512_fp28_p<opc, OpcodeStr#"ph", v8f16x_info, OpNode, sched.XMM>,
- EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ EVEX_V128, T_MAP6, PD, EVEX_CD8<16, CD8VF>;
defm PHZ256 : avx512_fp28_p<opc, OpcodeStr#"ph", v16f16x_info, OpNode, sched.YMM>,
- EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>;
+ EVEX_V256, T_MAP6, PD, EVEX_CD8<16, CD8VF>;
}
}
let Predicates = [HasERI] in {
@@ -9401,35 +9412,35 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
let Predicates = [HasFP16] in
defm PHZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
sched.PH.ZMM, v32f16_info>,
- EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>;
let Predicates = [HasFP16, HasVLX] in {
defm PHZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
sched.PH.XMM, v8f16x_info>,
- EVEX_V128, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ EVEX_V128, T_MAP5, EVEX_CD8<16, CD8VF>;
defm PHZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"),
sched.PH.YMM, v16f16x_info>,
- EVEX_V256, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ EVEX_V256, T_MAP5, EVEX_CD8<16, CD8VF>;
}
defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.ZMM, v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ EVEX_V512, TB, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
sched.PD.ZMM, v8f64_info>,
- EVEX_V512, REX_W, PD, EVEX_CD8<64, CD8VF>;
+ EVEX_V512, REX_W, TB, PD, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.XMM, v4f32x_info>,
- EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
+ EVEX_V128, TB, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.YMM, v8f32x_info>,
- EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
+ EVEX_V256, TB, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
sched.PD.XMM, v2f64x_info>,
- EVEX_V128, REX_W, PD, EVEX_CD8<64, CD8VF>;
+ EVEX_V128, REX_W, TB, PD, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
sched.PD.YMM, v4f64x_info>,
- EVEX_V256, REX_W, PD, EVEX_CD8<64, CD8VF>;
+ EVEX_V256, REX_W, TB, PD, EVEX_CD8<64, CD8VF>;
}
}
@@ -9439,13 +9450,13 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
let Predicates = [HasFP16] in
defm PHZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ph"),
sched.PH.ZMM, v32f16_info>,
- EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>;
defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"),
sched.PS.ZMM, v16f32_info>,
- EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+ EVEX_V512, TB, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"),
sched.PD.ZMM, v8f64_info>,
- EVEX_V512, REX_W, PD, EVEX_CD8<64, CD8VF>;
+ EVEX_V512, REX_W, TB, PD, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
@@ -9501,11 +9512,11 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
defm SHZ : avx512_sqrt_scalar<opc, OpcodeStr#"sh", sched.PH.Scl, f16x_info, NAME#"SH", HasFP16>,
- EVEX_CD8<16, CD8VT1>, EVEX_4V, T_MAP5XS;
+ EVEX_CD8<16, CD8VT1>, EVEX, VVVV, T_MAP5, XS;
defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
- EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ EVEX_CD8<32, CD8VT1>, EVEX, VVVV, TB, XS;
defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
- EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, REX_W;
+ EVEX_CD8<64, CD8VT1>, EVEX, VVVV, TB, XD, REX_W;
}
defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>,
@@ -9569,17 +9580,17 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
let Predicates = [HasFP16] in
defm VRNDSCALESHZ : avx512_rndscale_scalar<0x0A, "vrndscalesh",
SchedWriteFRnd.Scl, f16x_info>,
- AVX512PSIi8Base, TA, EVEX_4V,
+ AVX512PSIi8Base, TA, EVEX, VVVV,
EVEX_CD8<16, CD8VT1>;
defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
SchedWriteFRnd.Scl, f32x_info>,
- AVX512AIi8Base, EVEX_4V, VEX_LIG,
+ AVX512AIi8Base, EVEX, VVVV, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd",
SchedWriteFRnd.Scl, f64x_info>,
- REX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG,
+ REX_W, AVX512AIi8Base, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
@@ -9923,16 +9934,16 @@ multiclass avx512_pmovx_bw<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX, HasBWI] in {
defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v8i16x_info,
v16i8x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, WIG;
+ EVEX_CD8<8, CD8VH>, T8, PD, EVEX_V128, WIG;
defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v16i16x_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, WIG;
+ EVEX_CD8<8, CD8VH>, T8, PD, EVEX_V256, WIG;
}
let Predicates = [HasBWI] in {
defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v32i16_info,
v32i8x_info, i256mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, WIG;
+ EVEX_CD8<8, CD8VH>, T8, PD, EVEX_V512, WIG;
}
}
@@ -9943,16 +9954,16 @@ multiclass avx512_pmovx_bd<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v4i32x_info,
v16i8x_info, i32mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, WIG;
+ EVEX_CD8<8, CD8VQ>, T8, PD, EVEX_V128, WIG;
defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v8i32x_info,
v16i8x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, WIG;
+ EVEX_CD8<8, CD8VQ>, T8, PD, EVEX_V256, WIG;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v16i32_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, WIG;
+ EVEX_CD8<8, CD8VQ>, T8, PD, EVEX_V512, WIG;
}
}
@@ -9963,16 +9974,16 @@ multiclass avx512_pmovx_bq<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v2i64x_info,
v16i8x_info, i16mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, WIG;
+ EVEX_CD8<8, CD8VO>, T8, PD, EVEX_V128, WIG;
defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v4i64x_info,
v16i8x_info, i32mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, WIG;
+ EVEX_CD8<8, CD8VO>, T8, PD, EVEX_V256, WIG;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v8i64_info,
v16i8x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, WIG;
+ EVEX_CD8<8, CD8VO>, T8, PD, EVEX_V512, WIG;
}
}
@@ -9983,16 +9994,16 @@ multiclass avx512_pmovx_wd<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v4i32x_info,
v8i16x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, WIG;
+ EVEX_CD8<16, CD8VH>, T8, PD, EVEX_V128, WIG;
defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v8i32x_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, WIG;
+ EVEX_CD8<16, CD8VH>, T8, PD, EVEX_V256, WIG;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v16i32_info,
v16i16x_info, i256mem, LdFrag, OpNode>,
- EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, WIG;
+ EVEX_CD8<16, CD8VH>, T8, PD, EVEX_V512, WIG;
}
}
@@ -10003,16 +10014,16 @@ multiclass avx512_pmovx_wq<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v2i64x_info,
v8i16x_info, i32mem, LdFrag, InVecNode>,
- EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, WIG;
+ EVEX_CD8<16, CD8VQ>, T8, PD, EVEX_V128, WIG;
defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v4i64x_info,
v8i16x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, WIG;
+ EVEX_CD8<16, CD8VQ>, T8, PD, EVEX_V256, WIG;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v8i64_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, WIG;
+ EVEX_CD8<16, CD8VQ>, T8, PD, EVEX_V512, WIG;
}
}
@@ -10024,16 +10035,16 @@ multiclass avx512_pmovx_dq<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v2i64x_info,
v4i32x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
+ EVEX_CD8<32, CD8VH>, T8, PD, EVEX_V128;
defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v4i64x_info,
v4i32x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
+ EVEX_CD8<32, CD8VH>, T8, PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v8i64_info,
v8i32x_info, i256mem, LdFrag, OpNode>,
- EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
+ EVEX_CD8<32, CD8VH>, T8, PD, EVEX_V512;
}
}
@@ -10663,7 +10674,7 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo>{
- let ExeDomain = DestInfo.ExeDomain in {
+ let ExeDomain = DestInfo.ExeDomain, ImmT = Imm8 in {
defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -10689,7 +10700,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _>:
avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, sched, _, _>{
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, ImmT = Imm8 in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
@@ -10773,13 +10784,13 @@ multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> {
let Predicates = [Pred] in {
defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.ZMM, DestInfo.info512,
- SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
+ SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX, VVVV;
}
let Predicates = [Pred, HasVLX] in {
defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.XMM, DestInfo.info128,
- SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
+ SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX, VVVV;
defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.YMM, DestInfo.info256,
- SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
+ SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX, VVVV;
}
}
@@ -10835,38 +10846,38 @@ defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VREDUCESH: avx512_common_fp_sae_scalar_imm<"vreducesh", f16x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasFP16>,
- AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasFP16>,
- AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>;
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
@@ -10920,13 +10931,13 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched
}
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256,
- avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256,
- avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256,
- avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256,
- avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
multiclass avx512_valign<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
@@ -10962,15 +10973,15 @@ multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_valign<0x03, OpcodeStr, sched.ZMM, _.info512>,
- AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>,
- AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_V128;
// We can't really override the 256-bit version so change it back to unset.
let EVEX2VEXOverride = ? in
defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>,
- AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_V256;
}
}
@@ -11258,7 +11269,7 @@ defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>
multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched> {
defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, sched,
- avx512vl_f32_info, HasAVX512>, XS;
+ avx512vl_f32_info, HasAVX512>, TB, XS;
}
defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup,
@@ -11301,7 +11312,7 @@ multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr,
multiclass avx512_movddup<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
defm NAME: avx512_movddup_common<opc, OpcodeStr, sched,
- avx512vl_f64_info>, XD, REX_W;
+ avx512vl_f64_info>, TB, XD, REX_W;
}
defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", SchedWriteFShuffle>;
@@ -11369,9 +11380,9 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
(X86pextrb (_.VT _.RC:$src1), timm:$src2))]>,
- EVEX, TAPD, Sched<[WriteVecExtract]>;
+ EVEX, TA, PD, Sched<[WriteVecExtract]>;
- defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
+ defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TA, PD;
}
}
@@ -11382,15 +11393,15 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
(X86pextrw (_.VT _.RC:$src1), timm:$src2))]>,
- EVEX, PD, Sched<[WriteVecExtract]>;
+ EVEX, TB, PD, Sched<[WriteVecExtract]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX, TAPD, Sched<[WriteVecExtract]>;
+ EVEX, TA, PD, Sched<[WriteVecExtract]>;
- defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
+ defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TA, PD;
}
}
@@ -11402,14 +11413,14 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GRC:$dst,
(extractelt (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, TAPD, Sched<[WriteVecExtract]>;
+ EVEX, TA, PD, Sched<[WriteVecExtract]>;
def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (_.VT _.RC:$src1),
imm:$src2),addr:$dst)]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TA, PD,
Sched<[WriteVecExtractSt]>;
}
}
@@ -11427,7 +11438,7 @@ multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -11437,7 +11448,7 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
- (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX, VVVV,
Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>;
@@ -11452,17 +11463,17 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
- EVEX_4V, TAPD, Sched<[WriteVecInsert]>;
+ EVEX, VVVV, TA, PD, Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
- _.ScalarLdFrag, imm>, TAPD;
+ _.ScalarLdFrag, imm>, TA, PD;
}
}
defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
- extloadi8>, TAPD, WIG;
+ extloadi8>, TA, PD, WIG;
defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
- extloadi16>, PD, WIG;
+ extloadi16>, TB, PD, WIG;
defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, REX_W;
@@ -11501,11 +11512,11 @@ multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_FP>{
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp,
SchedWriteFShuffle>,
EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
- AVX512AIi8Base, EVEX_4V;
+ TA, EVEX, VVVV;
}
-defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, PS;
-defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_f64_info>, PD, REX_W;
+defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, TB;
+defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_f64_info>, TB, PD, REX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - Byte shift Left/Right
@@ -11543,10 +11554,10 @@ multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
}
defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
SchedWriteShuffle, HasBWI>,
- AVX512PDIi8Base, EVEX_4V, WIG;
+ AVX512PDIi8Base, EVEX, VVVV, WIG;
defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
SchedWriteShuffle, HasBWI>,
- AVX512PDIi8Base, EVEX_4V, WIG;
+ AVX512PDIi8Base, EVEX, VVVV, WIG;
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
string OpcodeStr, X86FoldableSchedWrite sched,
@@ -11584,7 +11595,7 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
}
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
- SchedWritePSADBW, HasBWI>, EVEX_4V, WIG;
+ SchedWritePSADBW, HasBWI>, EVEX, VVVV, WIG;
// Transforms to swizzle an immediate to enable better matching when
// memory operand isn't in the right place.
@@ -11659,7 +11670,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
(i8 timm:$src4)), 1, 1>,
- AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
+ AVX512AIi8Base, EVEX, VVVV, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -11667,7 +11678,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
(i8 timm:$src4)), 1, 0>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
@@ -11677,7 +11688,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.VT (_.BroadcastLdFrag addr:$src3)),
(i8 timm:$src4)), 1, 0>, EVEX_B,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}// Constraints = "$src1 = $dst"
@@ -12002,23 +12013,23 @@ multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
let Predicates = [HasAVX512] in
defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
_Vec.info512, _Tbl.info512>, AVX512AIi8Base,
- EVEX_4V, EVEX_V512;
+ EVEX, VVVV, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM,
_Vec.info128, _Tbl.info128>, AVX512AIi8Base,
- EVEX_4V, EVEX_V128;
+ EVEX, VVVV, EVEX_V128;
defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM,
_Vec.info256, _Tbl.info256>, AVX512AIi8Base,
- EVEX_4V, EVEX_V256;
+ EVEX, VVVV, EVEX_V256;
}
}
defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f32x_info, v4i32x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f64x_info, v2i64x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info,
@@ -12165,17 +12176,17 @@ multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> {
defm Z128 : AESI_binop_rm_int<Op, OpStr,
!cast<Intrinsic>(IntPrefix),
loadv2i64, 0, VR128X, i128mem>,
- EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, WIG;
+ EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V128, WIG;
defm Z256 : AESI_binop_rm_int<Op, OpStr,
!cast<Intrinsic>(IntPrefix#"_256"),
loadv4i64, 0, VR256X, i256mem>,
- EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, WIG;
+ EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V256, WIG;
}
let Predicates = [HasAVX512, HasVAES] in
defm Z : AESI_binop_rm_int<Op, OpStr,
!cast<Intrinsic>(IntPrefix#"_512"),
loadv8i64, 0, VR512, i512mem>,
- EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, WIG;
+ EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V512, WIG;
}
defm VAESENC : avx512_vaes<0xDC, "vaesenc", "int_x86_aesni_aesenc">;
@@ -12189,14 +12200,14 @@ defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast">
let Predicates = [HasAVX512, HasVPCLMULQDQ] in
defm VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>,
- EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, WIG;
+ EVEX, VVVV, EVEX_V512, EVEX_CD8<64, CD8VF>, WIG;
let Predicates = [HasVLX, HasVPCLMULQDQ] in {
defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>,
- EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, WIG;
+ EVEX, VVVV, EVEX_V128, EVEX_CD8<64, CD8VF>, WIG;
defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64,
- int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256,
+ int_x86_pclmulqdq_256>, EVEX, VVVV, EVEX_V256,
EVEX_CD8<64, CD8VF>, WIG;
}
@@ -12217,13 +12228,13 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
- T8PD, EVEX_4V, Sched<[sched]>;
+ T8, PD, EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.LdFrag addr:$src3))))>,
- T8PD, EVEX_4V,
+ T8, PD, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -12239,7 +12250,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
- T8PD, EVEX_4V, EVEX_B,
+ T8, PD, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12284,9 +12295,9 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
REX_W, EVEX_CD8<16, CD8VF>;
defm D : avx512_common_3Op_imm8<Prefix#"d", avx512vl_i32_info, dqOp,
- OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm Q : avx512_common_3Op_imm8<Prefix#"q", avx512vl_i64_info, dqOp, OpNode,
- sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ sched, HasVBMI2>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
}
// Concat & Shift
@@ -12321,13 +12332,13 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
(VTI.VT (OpNode VTI.RC:$src1,
VTI.RC:$src2, VTI.RC:$src3)),
IsCommutable, IsCommutable>,
- EVEX_4V, T8PD, Sched<[sched]>;
+ EVEX, VVVV, T8, PD, Sched<[sched]>;
defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.LdFrag addr:$src3))))>,
- EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
+ EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8, PD,
Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -12336,8 +12347,8 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
- EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
- T8PD, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, EVEX_CD8<32, CD8VF>, EVEX_B,
+ T8, PD, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
@@ -12406,7 +12417,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
(VTI.VT VTI.RC:$src2)),
(X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
- (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
+ (VTI.VT VTI.RC:$src2))>, EVEX, VVVV, T8, PD,
Sched<[sched]>;
defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.MemOp:$src2),
@@ -12416,7 +12427,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
(VTI.VT (VTI.LdFrag addr:$src2))),
(X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
(VTI.VT (VTI.LdFrag addr:$src2)))>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
+ EVEX, VVVV, EVEX_CD8<8, CD8VF>, T8, PD,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12451,7 +12462,7 @@ multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
defm VGF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb,
SchedWriteVecALU>,
- EVEX_CD8<8, CD8VF>, T8PD;
+ EVEX_CD8<8, CD8VF>, T8;
multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
@@ -12483,10 +12494,10 @@ multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
X86GF2P8affineinvqb, SchedWriteVecIMul>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
+ EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
X86GF2P8affineqb, SchedWriteVecIMul>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
+ EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
//===----------------------------------------------------------------------===//
@@ -12498,25 +12509,25 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle,
defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"v4fmaddps", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"v4fnmaddps", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fmaddss", "$src3, $src2", "$src2, $src3",
- []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fnmaddss", "$src3, $src2", "$src2, $src3",
- []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
}
@@ -12529,13 +12540,13 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt,
defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"vp4dpwssd", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"vp4dpwssds", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
}
@@ -12558,7 +12569,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT _.RC:$src2)))]>,
- EVEX_4V, T8XD, Sched<[sched]>;
+ EVEX, VVVV, T8, XD, Sched<[sched]>;
def rm : I<0x68, MRMSrcMem,
(outs _.KRPC:$dst),
@@ -12567,7 +12578,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
- EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, T8, XD, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : I<0x68, MRMSrcMem,
@@ -12577,7 +12588,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf
", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>,
- EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, T8, XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12623,7 +12634,7 @@ let ExeDomain = SSEPackedSingle in
defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF
avx512vl_f32_info, avx512vl_bf16_info,
- X86cvtne2ps2bf16, HasBF16, 0>, T8XD;
+ X86cvtne2ps2bf16, HasBF16, 0>, T8, XD;
// Truncate Float to BFloat16
multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
@@ -12660,7 +12671,7 @@ multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
}
defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16",
- SchedWriteCvtPD2PS>, T8XS,
+ SchedWriteCvtPD2PS>, T8, XS,
EVEX_CD8<32, CD8VF>;
let Predicates = [HasBF16, HasVLX] in {
@@ -12744,13 +12755,13 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins src_v.RC:$src2, src_v.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, src_v.RC:$src2, src_v.RC:$src3))>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins src_v.RC:$src2, src_v.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
- (src_v.LdFrag addr:$src3)))>, EVEX_4V,
+ (src_v.LdFrag addr:$src3)))>, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -12760,7 +12771,7 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat("$src2, ${src3}", _.BroadcastStr),
(_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
(src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
- EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_B, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // Constraints = "$src1 = $dst"
@@ -12783,7 +12794,7 @@ multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
let ExeDomain = SSEPackedSingle in
defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
avx512vl_f32_info, avx512vl_bf16_info,
- HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
+ HasBF16>, T8, XS, EVEX_CD8<32, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX512FP16
@@ -12792,12 +12803,12 @@ defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWrit
let Predicates = [HasFP16] in {
// Move word ( r/m16) to Packed word
def VMOVW2SHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
- "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveFromGpr]>;
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVWrm : AVX512<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i16mem:$src),
"vmovw\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v8i16 (scalar_to_vector (loadi16 addr:$src))))]>,
- T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFLoad]>;
+ T_MAP5, PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFLoad]>;
def : Pat<(f16 (bitconvert GR16:$src)),
(f16 (COPY_TO_REGCLASS
@@ -12854,13 +12865,13 @@ def : Pat<(v16i32 (X86vzmovl
// Move word from xmm register to r/m16
def VMOVSH2Wrr : AVX512<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
- "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveToGpr]>;
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, Sched<[WriteVecMoveToGpr]>;
def VMOVWmr : AVX512<0x7E, MRMDestMem, (outs),
(ins i16mem:$dst, VR128X:$src),
"vmovw\t{$src, $dst|$dst, $src}",
[(store (i16 (extractelt (v8i16 VR128X:$src),
(iPTR 0))), addr:$dst)]>,
- T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFStore]>;
+ T_MAP5, PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFStore]>;
def : Pat<(i16 (bitconvert FR16X:$src)),
(i16 (EXTRACT_SUBREG
@@ -12872,9 +12883,9 @@ def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))),
// Allow "vmovw" to use GR64
let hasSideEffects = 0 in {
def VMOVW64toSHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
- "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, REX_W, Sched<[WriteVecMoveFromGpr]>;
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, REX_W, Sched<[WriteVecMoveFromGpr]>;
def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
- "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>;
+ "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>;
}
}
@@ -12920,27 +12931,27 @@ multiclass avx512_cvttph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo
defm VCVTPH2UW : avx512_cvtph2w<0x7D, "vcvtph2uw", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, avx512vl_i16_info,
avx512vl_f16_info, SchedWriteCvtPD2DQ>,
- T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ T_MAP5, EVEX_CD8<16, CD8VF>;
defm VCVTUW2PH : avx512_cvtph2w<0x7D, "vcvtuw2ph", any_uint_to_fp, uint_to_fp,
X86VUintToFpRnd, avx512vl_f16_info,
avx512vl_i16_info, SchedWriteCvtPD2DQ>,
- T_MAP5XD, EVEX_CD8<16, CD8VF>;
+ T_MAP5, XD, EVEX_CD8<16, CD8VF>;
defm VCVTTPH2W : avx512_cvttph2w<0x7C, "vcvttph2w", X86any_cvttp2si,
X86cvttp2si, X86cvttp2siSAE,
avx512vl_i16_info, avx512vl_f16_info,
- SchedWriteCvtPD2DQ>, T_MAP5PD, EVEX_CD8<16, CD8VF>;
+ SchedWriteCvtPD2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VF>;
defm VCVTTPH2UW : avx512_cvttph2w<0x7C, "vcvttph2uw", X86any_cvttp2ui,
X86cvttp2ui, X86cvttp2uiSAE,
avx512vl_i16_info, avx512vl_f16_info,
- SchedWriteCvtPD2DQ>, T_MAP5PS, EVEX_CD8<16, CD8VF>;
+ SchedWriteCvtPD2DQ>, T_MAP5, EVEX_CD8<16, CD8VF>;
defm VCVTPH2W : avx512_cvtph2w<0x7D, "vcvtph2w", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, avx512vl_i16_info,
avx512vl_f16_info, SchedWriteCvtPD2DQ>,
- T_MAP5PD, EVEX_CD8<16, CD8VF>;
+ T_MAP5, PD, EVEX_CD8<16, CD8VF>;
defm VCVTW2PH : avx512_cvtph2w<0x7D, "vcvtw2ph", any_sint_to_fp, sint_to_fp,
X86VSintToFpRnd, avx512vl_f16_info,
avx512vl_i16_info, SchedWriteCvtPD2DQ>,
- T_MAP5XS, EVEX_CD8<16, CD8VF>;
+ T_MAP5, XS, EVEX_CD8<16, CD8VF>;
// Convert Half to Signed/Unsigned Doubleword
multiclass avx512_cvtph2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -12980,20 +12991,20 @@ multiclass avx512_cvttph2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
defm VCVTPH2DQ : avx512_cvtph2dq<0x5B, "vcvtph2dq", X86cvtp2Int, X86cvtp2Int,
- X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PD,
EVEX_CD8<16, CD8VH>;
defm VCVTPH2UDQ : avx512_cvtph2dq<0x79, "vcvtph2udq", X86cvtp2UInt, X86cvtp2UInt,
- X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PS,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5,
EVEX_CD8<16, CD8VH>;
defm VCVTTPH2DQ : avx512_cvttph2dq<0x5B, "vcvttph2dq", X86any_cvttp2si,
X86cvttp2si, X86cvttp2siSAE,
- SchedWriteCvtPS2DQ>, T_MAP5XS,
+ SchedWriteCvtPS2DQ>, T_MAP5, XS,
EVEX_CD8<16, CD8VH>;
defm VCVTTPH2UDQ : avx512_cvttph2dq<0x78, "vcvttph2udq", X86any_cvttp2ui,
X86cvttp2ui, X86cvttp2uiSAE,
- SchedWriteCvtPS2DQ>, T_MAP5PS,
+ SchedWriteCvtPS2DQ>, T_MAP5,
EVEX_CD8<16, CD8VH>;
// Convert Half to Signed/Unsigned Quadword
@@ -13043,21 +13054,21 @@ multiclass avx512_cvttph2qq<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
}
defm VCVTPH2QQ : avx512_cvtph2qq<0x7B, "vcvtph2qq", X86cvtp2Int, X86cvtp2Int,
- X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD,
+ X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PD,
EVEX_CD8<16, CD8VQ>;
defm VCVTPH2UQQ : avx512_cvtph2qq<0x79, "vcvtph2uqq", X86cvtp2UInt, X86cvtp2UInt,
- X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD,
+ X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PD,
EVEX_CD8<16, CD8VQ>;
defm VCVTTPH2QQ : avx512_cvttph2qq<0x7A, "vcvttph2qq", X86any_cvttp2si,
X86cvttp2si, X86cvttp2siSAE,
- SchedWriteCvtPS2DQ>, T_MAP5PD,
+ SchedWriteCvtPS2DQ>, T_MAP5, PD,
EVEX_CD8<16, CD8VQ>;
defm VCVTTPH2UQQ : avx512_cvttph2qq<0x78, "vcvttph2uqq", X86any_cvttp2ui,
X86cvttp2ui, X86cvttp2uiSAE,
- SchedWriteCvtPS2DQ>, T_MAP5PD,
+ SchedWriteCvtPS2DQ>, T_MAP5, PD,
EVEX_CD8<16, CD8VQ>;
// Convert Signed/Unsigned Quadword to Half
@@ -13154,53 +13165,53 @@ multiclass avx512_cvtqq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo
}
defm VCVTQQ2PH : avx512_cvtqq2ph<0x5B, "vcvtqq2ph", any_sint_to_fp, sint_to_fp,
- X86VSintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5PS,
+ X86VSintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5,
EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PH : avx512_cvtqq2ph<0x7A, "vcvtuqq2ph", any_uint_to_fp, uint_to_fp,
- X86VUintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5XD,
+ X86VUintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5, XD,
EVEX_CD8<64, CD8VF>;
// Convert half to signed/unsigned int 32/64
defm VCVTSH2SIZ: avx512_cvt_s_int_round<0x2D, f16x_info, i32x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{l}", HasFP16>,
- T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+ T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
defm VCVTSH2SI64Z: avx512_cvt_s_int_round<0x2D, f16x_info, i64x_info, X86cvts2si,
X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{q}", HasFP16>,
- T_MAP5XS, REX_W, EVEX_CD8<16, CD8VT1>;
+ T_MAP5, XS, REX_W, EVEX_CD8<16, CD8VT1>;
defm VCVTSH2USIZ: avx512_cvt_s_int_round<0x79, f16x_info, i32x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{l}", HasFP16>,
- T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+ T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
defm VCVTSH2USI64Z: avx512_cvt_s_int_round<0x79, f16x_info, i64x_info, X86cvts2usi,
X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{q}", HasFP16>,
- T_MAP5XS, REX_W, EVEX_CD8<16, CD8VT1>;
+ T_MAP5, XS, REX_W, EVEX_CD8<16, CD8VT1>;
defm VCVTTSH2SIZ: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i32x_info,
any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
- "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+ "{l}", HasFP16>, T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
defm VCVTTSH2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i64x_info,
any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
- "{q}", HasFP16>, REX_W, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+ "{q}", HasFP16>, REX_W, T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
defm VCVTTSH2USIZ: avx512_cvt_s_all<0x78, "vcvttsh2usi", f16x_info, i32x_info,
any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
- "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
+ "{l}", HasFP16>, T_MAP5, XS, EVEX_CD8<16, CD8VT1>;
defm VCVTTSH2USI64Z: avx512_cvt_s_all<0x78, "vcvttsh2usi", f16x_info, i64x_info,
any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
- "{q}", HasFP16>, T_MAP5XS, REX_W, EVEX_CD8<16, CD8VT1>;
+ "{q}", HasFP16>, T_MAP5, XS, REX_W, EVEX_CD8<16, CD8VT1>;
let Predicates = [HasFP16] in {
defm VCVTSI2SHZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR32,
v8f16x_info, i32mem, loadi32, "cvtsi2sh", "l">,
- T_MAP5XS, EVEX_CD8<32, CD8VT1>;
+ T_MAP5, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSI642SHZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR64,
v8f16x_info, i64mem, loadi64, "cvtsi2sh","q">,
- T_MAP5XS, REX_W, EVEX_CD8<64, CD8VT1>;
+ T_MAP5, XS, REX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTUSI2SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR32,
v8f16x_info, i32mem, loadi32,
- "cvtusi2sh","l">, T_MAP5XS, EVEX_CD8<32, CD8VT1>;
+ "cvtusi2sh","l">, T_MAP5, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTUSI642SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR64,
v8f16x_info, i64mem, loadi64, "cvtusi2sh", "q">,
- T_MAP5XS, REX_W, EVEX_CD8<64, CD8VT1>;
+ T_MAP5, XS, REX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtsi2sh\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SHZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
@@ -13390,17 +13401,17 @@ let Constraints = "@earlyclobber $dst, $src1 = $dst" in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX_4V;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX, VVVV;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX_4V;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX, VVVV;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr),
- (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX_4V;
+ (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX, VVVV;
}
} // Constraints = "@earlyclobber $dst, $src1 = $dst"
@@ -13411,7 +13422,7 @@ multiclass avx512_cfmaop_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc)))>,
- EVEX_4V, EVEX_B, EVEX_RC;
+ EVEX, VVVV, EVEX_B, EVEX_RC;
}
@@ -13446,14 +13457,14 @@ multiclass avx512_cfmulop_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Uses = [MXCSR] in {
defm VFMADDCPH : avx512_cfmaop_common<0x56, "vfmaddcph", x86vfmaddc, x86vfmaddcRnd, 1>,
- T_MAP6XS, EVEX_CD8<32, CD8VF>;
+ T_MAP6, XS, EVEX_CD8<32, CD8VF>;
defm VFCMADDCPH : avx512_cfmaop_common<0x56, "vfcmaddcph", x86vfcmaddc, x86vfcmaddcRnd, 0>,
- T_MAP6XD, EVEX_CD8<32, CD8VF>;
+ T_MAP6, XD, EVEX_CD8<32, CD8VF>;
defm VFMULCPH : avx512_cfmulop_common<0xD6, "vfmulcph", x86vfmulc, x86vfmulc,
- x86vfmulcRnd, 1>, T_MAP6XS, EVEX_CD8<32, CD8VF>;
+ x86vfmulcRnd, 1>, T_MAP6, XS, EVEX_CD8<32, CD8VF>;
defm VFCMULCPH : avx512_cfmulop_common<0xD6, "vfcmulcph", x86vfcmulc,
- x86vfcmulc, x86vfcmulcRnd, 0>, T_MAP6XD, EVEX_CD8<32, CD8VF>;
+ x86vfcmulc, x86vfcmulcRnd, 0>, T_MAP6, XD, EVEX_CD8<32, CD8VF>;
}
@@ -13504,12 +13515,12 @@ multiclass avx512_cfmbinop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNod
let Uses = [MXCSR] in {
defm VFMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd, 1>,
- T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+ T_MAP6, XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV;
defm VFCMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd, 0>,
- T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+ T_MAP6, XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV;
defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd, 1>,
- T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+ T_MAP6, XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV;
defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>,
- T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+ T_MAP6, XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV;
}
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 8c355e84a065..6b0c1b8c28c9 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -45,737 +45,297 @@ def PLEA64r : PseudoI<(outs GR64:$dst), (ins anymem:$src), []>;
}
//===----------------------------------------------------------------------===//
-// Fixed-Register Multiplication and Division Instructions.
+// MUL/IMUL and DIV/IDIV Instructions
//
-
-// BinOpRR - Binary instructions with inputs "reg, reg".
-class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
- : ITy<opcode, MRMDestReg, typeinfo, outlist,
- (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
- Sched<[sched]>;
-
-// BinOpRR_F - Binary instructions with inputs "reg, reg", where the pattern
-// has just a EFLAGS as a result.
-class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDPatternOperator opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs), WriteALU,
- [(set EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
-
-// BinOpRR_RF - Binary instructions with inputs "reg, reg", where the pattern
-// has both a regclass and EFLAGS as a result.
-class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
- [(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
-
-// BinOpRR_RFF - Binary instructions with inputs "reg, reg", where the pattern
-// has both a regclass and EFLAGS as a result, and has EFLAGS as input.
-class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC,
- [(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
- EFLAGS))]>;
-
-// BinOpRR_Rev - Binary instructions with inputs "reg, reg"(reversed encoding).
-class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- X86FoldableSchedWrite sched = WriteALU>
- : ITy<opcode, MRMSrcReg, typeinfo,
- (outs typeinfo.RegClass:$dst),
- (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $dst|$dst, $src2}", []>,
- Sched<[sched]> {
- // The disassembler should know about this, but not the asmparser.
- let isCodeGenOnly = 1;
- let ForceDisassemble = 1;
- let hasSideEffects = 0;
+class MulDivOpR<bits<8> o, Format f, string m, X86TypeInfo t,
+ X86FoldableSchedWrite sched, list<dag> p>
+ : UnaryOpR<o, f, m, "$src1", t, (outs), p> {
+ let SchedRW = [sched];
}
-// BinOpRR_RFF_Rev - Binary instructions with inputs "reg, reg"(reversed
-// encoding), with sched = WriteADC.
-class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
- : BinOpRR_Rev<opcode, mnemonic, typeinfo, WriteADC>;
-
-// BinOpRR_F_Rev - Binary instructions with inputs "reg, reg"(reversed
-// encoding), without outlist dag.
-class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
- : ITy<opcode, MRMSrcReg, typeinfo, (outs),
- (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", []>,
- Sched<[WriteALU]> {
- // The disassembler should know about this, but not the asmparser.
- let isCodeGenOnly = 1;
- let ForceDisassemble = 1;
- let hasSideEffects = 0;
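+// MulDivOpM - Instructions like "mul [mem]", "div [mem]".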
+class MulDivOpM<bits<8> o, Format f, string m, X86TypeInfo t,
+ X86FoldableSchedWrite sched, list<dag> p>
+ : UnaryOpM<o, f, m, "$src1", t, (outs), p> {
+ let SchedRW =
+ [sched.Folded,
+ // Memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Register reads (implicit or explicit).
+ sched.ReadAfterFold, sched.ReadAfterFold];
}
-// BinOpRM - Binary instructions with inputs "reg, [mem]".
-class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
- : ITy<opcode, MRMSrcMem, typeinfo, outlist,
- (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
-
-// BinOpRM_ImplicitUse - Binary instructions with inputs "reg, [mem]".
-// There is an implicit register read at the end of the operand sequence.
-class BinOpRM_ImplicitUse<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
- : ITy<opcode, MRMSrcMem, typeinfo, outlist,
- (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
- Sched<[sched.Folded, sched.ReadAfterFold,
- // base, scale, index, offset, segment.
- ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
- // implicit register read.
- sched.ReadAfterFold]>;
-
-// BinOpRM_F - Binary instructions with inputs "reg, [mem]", where the pattern
-// has just a EFLAGS as a result.
-class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs), WriteALU,
- [(set EFLAGS,
- (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
-
-// BinOpRM_RF - Binary instructions with inputs "reg, [mem]", where the pattern
-// has both a regclass and EFLAGS as a result.
-class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU,
- [(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
-
-// BinOpRM_RFF - Binary instructions with inputs "reg, [mem]", where the pattern
-// has both a regclass and EFLAGS as a result, and has EFLAGS as input.
-class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpRM_ImplicitUse<opcode, mnemonic, typeinfo,
- (outs typeinfo.RegClass:$dst), WriteADC,
- [(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1,
- (typeinfo.LoadNode addr:$src2), EFLAGS))]>;
-
-// BinOpRI - Binary instructions with inputs "reg, imm".
-class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
- : ITy<opcode, f, typeinfo, outlist,
- (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
- Sched<[sched]> {
- let ImmT = typeinfo.ImmEncoding;
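+// Mul - Multiplications like "mul r/m" that implicitly use and define the
+// A/D registers, e.g. AX,DX = AX * r/m16.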
+multiclass Mul<bits<8> o, string m, Format RegMRM, Format MemMRM, SDPatternOperator node> {
+ // AL is really implied by AX, but the registers in Defs must match the
+ // SDNode results (i8, i32).
+ //
+  // FIXME: Used for 8-bit mul; the upper 8 bits of the result are ignored.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+ def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8,
+ [(set AL, (node AL, GR8:$src1)), (implicit EFLAGS)]>;
+ let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+ def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16;
+ let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+ def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32;
+ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+ def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>;
+ let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+ def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8,
+ [(set AL, (node AL, (loadi8 addr:$src1))), (implicit EFLAGS)]>;
+ let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+ def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16;
+ let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+ def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32;
+ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+ def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>;
}
-// BinOpRI_F - Binary instructions with inputs "reg, imm", where the pattern
-// has EFLAGS as a result.
-class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDPatternOperator opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), WriteALU,
- [(set EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
-
-// BinOpRI_RF - Binary instructions with inputs "reg, imm", where the pattern
-// has both a regclass and EFLAGS as a result.
-class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU,
- [(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
-
-// BinOpRI_RFF - Binary instructions with inputs "reg, imm", where the pattern
-// has both a regclass and EFLAGS as a result, and has EFLAGS as input.
-class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode, Format f>
- : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC,
- [(set typeinfo.RegClass:$dst, EFLAGS,
- (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
- EFLAGS))]>;
-
-// BinOpRI8 - Binary instructions with inputs "reg, imm8".
-class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern>
- : ITy<opcode, f, typeinfo, outlist,
- (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
- Sched<[sched]> {
- let ImmT = Imm8; // Always 8-bit immediate.
+defm MUL : Mul<0xF7, "mul", MRM4r, MRM4m, mul>;
+defm IMUL : Mul<0xF7, "imul", MRM5r, MRM5m, null_frag>;
+
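+// Div - Divisions like "div r/m" that implicitly use and define the A/D
+// registers, e.g. DX:AX / r/m16 = AX,DX.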
+multiclass Div<bits<8> o, string m, Format RegMRM, Format MemMRM> {
+ defvar sched8 = !if(!eq(m, "div"), WriteDiv8, WriteIDiv8);
+ defvar sched16 = !if(!eq(m, "div"), WriteDiv16, WriteIDiv16);
+ defvar sched32 = !if(!eq(m, "div"), WriteDiv32, WriteIDiv32);
+ defvar sched64 = !if(!eq(m, "div"), WriteDiv64, WriteIDiv64);
+ let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+ def 8r : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>;
+ let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+ def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16;
+ let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+ def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32;
+ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+ def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>;
+ let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+ def 8m : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>;
+ let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+ def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16;
+ let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+ def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32;
+ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+ def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>;
}
-
-// BinOpRI8_F - Binary instructions with inputs "reg, imm8".
-class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), WriteALU, []>;
-
-// BinOpRI8_RF - Binary instructions with inputs "reg, imm8".
-class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU, []>;
-
-// BinOpRI8_RFF - Binary instructions with inputs "reg, imm8".
-class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f>
- : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC, []>;
-
-// BinOpMR - Binary instructions with inputs "[mem], reg".
-class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- list<dag> pattern>
- : ITy<opcode, MRMDestMem, typeinfo,
- (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern>;
-
-// BinOpMR_RMW - Binary instructions with inputs "[mem], reg", where the pattern
-// implicitly use EFLAGS.
-class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpMR<opcode, mnemonic, typeinfo,
- [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
- (implicit EFLAGS)]>,
- Sched<[WriteALURMW,
- // base, scale, index, offset, segment
- ReadDefault, ReadDefault, ReadDefault,
- ReadDefault, ReadDefault,
- WriteALU.ReadAfterFold]>; // reg
-
-// BinOpMR_RMW_FF - Binary instructions with inputs "[mem], reg", where the
-// pattern sets EFLAGS and implicitly uses EFLAGS.
-class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpMR<opcode, mnemonic, typeinfo,
- [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
- addr:$dst),
- (implicit EFLAGS)]>,
- Sched<[WriteADCRMW,
- // base, scale, index, offset, segment
- ReadDefault, ReadDefault, ReadDefault,
- ReadDefault, ReadDefault,
- WriteALU.ReadAfterFold, // reg
- WriteALU.ReadAfterFold]>; // EFLAGS
-
-// BinOpMR_F - Binary instructions with inputs "[mem], reg", where the pattern
-// has EFLAGS as a result.
-class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDPatternOperator opnode>
- : BinOpMR<opcode, mnemonic, typeinfo,
- [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
- typeinfo.RegClass:$src))]>,
- Sched<[WriteALU.Folded, ReadDefault, ReadDefault, ReadDefault,
- ReadDefault, ReadDefault, WriteALU.ReadAfterFold]>;
-
-// BinOpMI - Binary instructions with inputs "[mem], imm".
-class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern>
- : ITy<opcode, f, typeinfo,
- (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern> {
- let ImmT = typeinfo.ImmEncoding;
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+defm DIV: Div<0xF7, "div", MRM6r, MRM6m>;
+defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>;
}
-// BinOpMI_RMW - Binary instructions with inputs "[mem], imm", where the
-// pattern implicitly use EFLAGS.
-class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode, Format f>
- : BinOpMI<opcode, mnemonic, typeinfo, f,
- [(store (opnode (typeinfo.VT (load addr:$dst)),
- typeinfo.ImmOperator:$src), addr:$dst),
- (implicit EFLAGS)]>,
- Sched<[WriteALURMW]>;
-
-// BinOpMI_RMW_FF - Binary instructions with inputs "[mem], imm", where the
-// pattern sets EFLAGS and implicitly uses EFLAGS.
-class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode, Format f>
- : BinOpMI<opcode, mnemonic, typeinfo, f,
- [(store (opnode (typeinfo.VT (load addr:$dst)),
- typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
- (implicit EFLAGS)]>,
- Sched<[WriteADCRMW]>;
-
-// BinOpMI_F - Binary instructions with inputs "[mem], imm", where the pattern
-// has EFLAGS as a result.
-class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDPatternOperator opnode, Format f>
- : BinOpMI<opcode, mnemonic, typeinfo, f,
- [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst),
- typeinfo.ImmOperator:$src))]>,
- Sched<[WriteALU.Folded]>;
-
-// BinOpMI8 - Binary instructions with inputs "[mem], imm8".
-class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern>
- : ITy<0x82, f, typeinfo,
- (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern> {
- let ImmT = Imm8; // Always 8-bit immediate.
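+// IMulOpRR - Instructions like "imul reg, reg".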
+class IMulOpRR<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpRR_RF<0xAF, "imul", t, X86smul_flag>, TB {
+ let Form = MRMSrcReg;
+ let SchedRW = [sched];
+ // X = IMUL Y, Z --> X = IMUL Z, Y
+ let isCommutable = 1;
}
-
-// BinOpMI8_RMW - Binary instructions with inputs "[mem], imm8".
-class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, Format f>
- : BinOpMI8<mnemonic, typeinfo, f, []>, Sched<[WriteALURMW]>;
-
-// BinOpMI8_RMW_FF - Binary instructions with inputs "[mem], imm8".
-class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, Format f>
- : BinOpMI8<mnemonic, typeinfo, f, []>, Sched<[WriteADCRMW]>;
-
-// BinOpMI8_F - Binary instructions with inputs "[mem], imm8"
-class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, Format f>
- : BinOpMI8<mnemonic, typeinfo, f, []>, Sched<[WriteALU.Folded]>;
-
-// BinOpAI - Binary instructions with input imm, that implicitly use A reg and
-// implicitly define Areg and EFLAGS.
-class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg, string operands, X86FoldableSchedWrite sched = WriteALU>
- : ITy<opcode, RawFrm, typeinfo,
- (outs), (ins typeinfo.ImmOperand:$src),
- mnemonic, operands, []>,
- Sched<[sched]> {
- let ImmT = typeinfo.ImmEncoding;
- let Uses = [areg];
- let Defs = [areg, EFLAGS];
- let hasSideEffects = 0;
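+// IMulOpRM - Instructions like "imul reg, [mem]".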
+class IMulOpRM<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpRM_RF<0xAF, "imul", t, X86smul_flag>, TB {
+  let Form = MRMSrcMem;
+  let SchedRW = [sched.Folded, sched.ReadAfterFold];
}
-// BinOpAI_RFF - Binary instructions with input imm, that implicitly use and
-// define Areg and EFLAGS.
-class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg, string operands>
- : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, WriteADC> {
- let Uses = [areg, EFLAGS];
+def IMUL16rr : IMulOpRR<Xi16, WriteIMul16Reg>, OpSize16;
+def IMUL32rr : IMulOpRR<Xi32, WriteIMul32Reg>, OpSize32;
+def IMUL64rr : IMulOpRR<Xi64, WriteIMul64Reg>;
+def IMUL16rm : IMulOpRM<Xi16, WriteIMul16Reg>, OpSize16;
+def IMUL32rm : IMulOpRM<Xi32, WriteIMul32Reg>, OpSize32;
+def IMUL64rm : IMulOpRM<Xi64, WriteIMul64Reg>;
+
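+// IMulOpRI8_R - Instructions like "imul reg, reg, i8".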
+class IMulOpRI8_R<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpRI8<0x6B, "imul", binop_ndd_args, t, MRMSrcReg,
+ (outs t.RegClass:$dst)>, DefEFLAGS {
+ let SchedRW = [sched];
}
-
-// BinOpAI_F - Binary instructions with input imm, that implicitly use A reg and
-// implicitly define EFLAGS.
-class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg, string operands>
- : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
- let Defs = [EFLAGS];
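+// IMulOpRI_R - Instructions like "imul reg, reg, i16/i32/i64".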
+class IMulOpRI_R<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg,
+ (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (X86smul_flag t.RegClass:$src1,
+ t.ImmNoSuOperator:$src2))]>, DefEFLAGS {
+ let SchedRW = [sched];
}
-
-// UnaryOpM - Unary instructions with a memory operand.
-class UnaryOpM<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info,
- list<dag> pattern>
- : ITy<opcode, f, info, (outs), (ins info.MemOperand:$dst), mnemonic,
- "$dst", pattern>;
-
-// UnaryOpR - Unary instructions with a register.
-class UnaryOpR<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info,
- list<dag> pattern>
- : ITy<opcode, f, info, (outs info.RegClass:$dst),
- (ins info.RegClass:$src1), mnemonic, "$dst", pattern>;
-
-// INCDECR - Instructions like "inc reg".
-class INCDECR<Format f, string mnemonic, X86TypeInfo info,
- SDPatternOperator node>
- : UnaryOpR<0xFE, f, mnemonic, info,
- [(set info.RegClass:$dst, EFLAGS,
- (node info.RegClass:$src1, 1))]>;
-
-// INCDECM - Instructions like "inc [mem]".
-class INCDECM<Format f, string mnemonic, X86TypeInfo info, int num>
- : UnaryOpM<0xFE, f, mnemonic, info,
- [(store (add (info.LoadNode addr:$dst), num), addr:$dst),
- (implicit EFLAGS)]>;
-
-// INCDECR_ALT - Instructions like "inc reg" short forms.
-class INCDECR_ALT<bits<8> opcode, string mnemonic, X86TypeInfo info>
- : UnaryOpR<opcode, AddRegFrm, mnemonic, info, []>{
- let Predicates = [Not64BitMode];
- let Opcode = opcode;
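+// IMulOpMI8_R - Instructions like "imul reg, [mem], i8".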
+class IMulOpMI8_R<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)>,
+ DefEFLAGS {
+ let Opcode = 0x6B;
+ let SchedRW = [sched.Folded];
}
-
-// MulOpR - Instructions like "mul reg".
-class MulOpR<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched, list<dag> pattern>
- : ITy<opcode, f, info, (outs), (ins info.RegClass:$src), mnemonic,
- "$src", pattern>,
- Sched<[sched]>;
-
-// MulOpM - Instructions like "mul [mem]".
-class MulOpM<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched, list<dag> pattern>
- : ITy<opcode, f, info, (outs), (ins info.MemOperand:$src), mnemonic,
- "$src", pattern>, SchedLoadReg<sched>;
-
-// NegOpR - Instructions like "neg reg", with implicit EFLAGS.
-class NegOpR<bits<8> opcode, string mnemonic, X86TypeInfo info>
- : UnaryOpR<opcode, MRM3r, mnemonic, info,
- [(set info.RegClass:$dst, (ineg info.RegClass:$src1)),
- (implicit EFLAGS)]>;
-
-// NotOpR - Instructions like "not reg".
-class NotOpR<bits<8> opcode, string mnemonic, X86TypeInfo info>
- : UnaryOpR<opcode, MRM2r, mnemonic, info,
- [(set info.RegClass:$dst,
- (not info.RegClass:$src1))]>;
-
-// NegOpM - Instructions like "neg [mem]", with implicit EFLAGS.
-class NegOpM<bits<8> opcode, string mnemonic, X86TypeInfo info>
- : UnaryOpM<opcode, MRM3m, mnemonic, info,
- [(store (ineg (info.LoadNode addr:$dst)), addr:$dst),
- (implicit EFLAGS)]>;
-
-// NotOpM - Instructions like "neg [mem]".
-class NotOpM<bits<8> opcode, string mnemonic, X86TypeInfo info>
- : UnaryOpM<opcode, MRM2m, mnemonic, info,
- [(store (not (info.LoadNode addr:$dst)), addr:$dst)]>;
-
-// BinOpRR_C - Binary instructions with inputs "reg, reg", which used mainly
-// with Constraints = "$src1 = $dst".
-class BinOpRR_C<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info,
- list<dag> pattern>
- : ITy<opcode, f, info, (outs info.RegClass:$dst),
- (ins info.RegClass:$src1, info.RegClass:$src2),
- mnemonic, "{$src2, $dst|$dst, $src2}", pattern>;
-
-// BinOpRM_C - Binary instructions with inputs "reg, [mem]", which used mainly
-// with Constraints = "$src1 = $dst".
-class BinOpRM_C<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info,
- list<dag> pattern>
- : ITy<opcode, f, info, (outs info.RegClass:$dst),
- (ins info.RegClass:$src1, info.MemOperand:$src2),
- mnemonic, "{$src2, $dst|$dst, $src2}", pattern>;
-
-// IMulOpRR - Instructions like "imul reg, reg, i8".
-class IMulOpRR<bits<8> opcode, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched>
- : BinOpRR_C<opcode, MRMSrcReg, mnemonic, info,
- [(set info.RegClass:$dst, EFLAGS,
- (X86smul_flag info.RegClass:$src1,
- info.RegClass:$src2))]>,
- Sched<[sched]>, TB;
-
-// IMulOpRM - Instructions like "imul reg, reg, [mem]".
-class IMulOpRM<bits<8> opcode, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched>
- : BinOpRM_C<opcode, MRMSrcMem, mnemonic, info,
- [(set info.RegClass:$dst, EFLAGS,
- (X86smul_flag info.RegClass:$src1, (info.LoadNode addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, TB;
-
-// IMulOpRRI8 - Instructions like "imul reg, reg, i8".
-class IMulOpRRI8<bits<8> opcode, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched>
- : ITy<opcode, MRMSrcReg, info, (outs info.RegClass:$dst),
- (ins info.RegClass:$src1, info.Imm8Operand:$src2), mnemonic,
- "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched]> {
- let ImmT = Imm8;
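+// IMulOpMI_R - Instructions like "imul reg, [mem], i16/i32/i64".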
+class IMulOpMI_R<X86TypeInfo t, X86FoldableSchedWrite sched>
+ : BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem,
+ (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (X86smul_flag (t.LoadNode addr:$src1),
+ t.ImmNoSuOperator:$src2))]>,
+ DefEFLAGS {
+ let SchedRW = [sched.Folded];
}
+def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, OpSize16;
+def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, OpSize32;
+def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>;
+def IMUL16rri : IMulOpRI_R<Xi16, WriteIMul16Imm>, OpSize16;
+def IMUL32rri : IMulOpRI_R<Xi32, WriteIMul32Imm>, OpSize32;
+def IMUL64rri32 : IMulOpRI_R<Xi64, WriteIMul64Imm>;
+
+def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, OpSize16;
+def IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, OpSize32;
+def IMUL64rmi8 : IMulOpMI8_R<Xi64, WriteIMul64Imm>;
+def IMUL16rmi : IMulOpMI_R<Xi16, WriteIMul16Imm>, OpSize16;
+def IMUL32rmi : IMulOpMI_R<Xi32, WriteIMul32Imm>, OpSize32;
+def IMUL64rmi32 : IMulOpMI_R<Xi64, WriteIMul64Imm>;
-// IMulOpRRI - Instructions like "imul reg, reg, i16/i32/i64".
-class IMulOpRRI<bits<8> opcode, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched>
- : ITy<opcode, MRMSrcReg, info, (outs info.RegClass:$dst),
- (ins info.RegClass:$src1, info.ImmOperand:$src2), mnemonic,
- "{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set info.RegClass:$dst, EFLAGS,
- (X86smul_flag info.RegClass:$src1,
- info.ImmNoSuOperator:$src2))]>,
- Sched<[sched]>{
- let ImmT = info.ImmEncoding;
+//===----------------------------------------------------------------------===//
+// INC and DEC Instructions
+//
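+// IncOpR_RF/DecOpR_RF - Instructions like "inc reg"/"dec reg", setting EFLAGS.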
+class IncOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag> {
+ let Pattern = [(set t.RegClass:$dst, EFLAGS,
+ (X86add_flag_nocf t.RegClass:$src1, 1))];
}
-
-// IMulOpRMI8 - Instructions like "imul reg, [mem], i8".
-class IMulOpRMI8<bits<8> opcode, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched>
- : ITy<opcode, MRMSrcMem, info, (outs info.RegClass:$dst),
- (ins info.MemOperand:$src1, info.Imm8Operand:$src2), mnemonic,
- "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched.Folded]> {
- let ImmT = Imm8;
+class DecOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag> {
+ let Pattern = [(set t.RegClass:$dst, EFLAGS,
+ (X86sub_flag_nocf t.RegClass:$src1, 1))];
}
-
-// IMulOpRMI - Instructions like "imul reg, [mem], i16/i32/i64".
-class IMulOpRMI<bits<8> opcode, string mnemonic, X86TypeInfo info,
- X86FoldableSchedWrite sched>
- : ITy<opcode, MRMSrcMem, info, (outs info.RegClass:$dst),
- (ins info.MemOperand:$src1, info.ImmOperand:$src2), mnemonic,
- "{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set info.RegClass:$dst, EFLAGS,
- (X86smul_flag (info.LoadNode addr:$src1),
- info.ImmNoSuOperator:$src2))]>,
- Sched<[sched.Folded]>{
- let ImmT = info.ImmEncoding;
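+// IncOpM_M/DecOpM_M - Instructions like "inc [mem]"/"dec [mem]", setting EFLAGS.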
+class IncOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> {
+ let Pattern = [(store (add (t.LoadNode addr:$src1), 1), addr:$src1),
+ (implicit EFLAGS)];
}
-
-let Defs = [EFLAGS] in {
-let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
-let hasSideEffects = 0 in {
-def INC16r_alt : INCDECR_ALT<0x40, "inc", Xi16>;
-def INC32r_alt : INCDECR_ALT<0x40, "inc", Xi32>;
-} // hasSideEffects = 0
-
-let isConvertibleToThreeAddress = 1 in { // Can xform into LEA.
-def INC8r : INCDECR<MRM0r, "inc", Xi8, X86add_flag_nocf>;
-def INC16r : INCDECR<MRM0r, "inc", Xi16, X86add_flag_nocf>;
-def INC32r : INCDECR<MRM0r, "inc", Xi32, X86add_flag_nocf>;
-def INC64r : INCDECR<MRM0r, "inc", Xi64, X86add_flag_nocf>;
-} // isConvertibleToThreeAddress = 1
-} // Constraints = "$src1 = $dst", SchedRW
-
-let SchedRW = [WriteALURMW] in {
-let Predicates = [UseIncDec] in {
- def INC8m : INCDECM<MRM0m, "inc", Xi8, 1>;
- def INC16m : INCDECM<MRM0m, "inc", Xi16, 1>;
- def INC32m : INCDECM<MRM0m, "inc", Xi32, 1>;
-} // Predicates
-let Predicates = [UseIncDec, In64BitMode] in {
- def INC64m : INCDECM<MRM0m, "inc", Xi64, 1>;
-} // Predicates
-} // SchedRW
-
-let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+class DecOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> {
+ let Pattern = [(store (add (t.LoadNode addr:$src1), -1), addr:$src1),
+ (implicit EFLAGS)];
+}
+// IncDec_Alt - Short forms of instructions like "inc reg".
// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
-let hasSideEffects = 0 in {
-def DEC16r_alt : INCDECR_ALT<0x48, "dec", Xi16>;
-def DEC32r_alt : INCDECR_ALT<0x48, "dec", Xi32>;
-} // hasSideEffects = 0
-
-let isConvertibleToThreeAddress = 1 in { // Can xform into LEA.
-def DEC8r : INCDECR<MRM1r, "dec", Xi8, X86sub_flag_nocf>;
-def DEC16r : INCDECR<MRM1r, "dec", Xi16, X86sub_flag_nocf>;
-def DEC32r : INCDECR<MRM1r, "dec", Xi32, X86sub_flag_nocf>;
-def DEC64r : INCDECR<MRM1r, "dec", Xi64, X86sub_flag_nocf>;
-} // isConvertibleToThreeAddress = 1
-} // Constraints = "$src1 = $dst", SchedRW
-
-let SchedRW = [WriteALURMW] in {
+class IncDec_Alt<bits<8> o, string m, X86TypeInfo t>
+ : UnaryOpR_RF<o, AddRegFrm, m, t, null_frag>, Requires<[Not64BitMode]>;
+
+let isConvertibleToThreeAddress = 1 in {
+def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16;
+def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32;
+def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16;
+def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32;
+def INC8r : IncOpR_RF<Xi8>;
+def INC16r : IncOpR_RF<Xi16>, OpSize16;
+def INC32r : IncOpR_RF<Xi32>, OpSize32;
+def INC64r : IncOpR_RF<Xi64>;
+def DEC8r : DecOpR_RF<Xi8>;
+def DEC16r : DecOpR_RF<Xi16>, OpSize16;
+def DEC32r : DecOpR_RF<Xi32>, OpSize32;
+def DEC64r : DecOpR_RF<Xi64>;
+}
let Predicates = [UseIncDec] in {
- def DEC8m : INCDECM<MRM1m, "dec", Xi8, -1>;
- def DEC16m : INCDECM<MRM1m, "dec", Xi16, -1>;
- def DEC32m : INCDECM<MRM1m, "dec", Xi32, -1>;
-} // Predicates
+def INC8m : IncOpM_M<Xi8>;
+def INC16m : IncOpM_M<Xi16>, OpSize16;
+def INC32m : IncOpM_M<Xi32>, OpSize32;
+def DEC8m : DecOpM_M<Xi8>;
+def DEC16m : DecOpM_M<Xi16>, OpSize16;
+def DEC32m : DecOpM_M<Xi32>, OpSize32;
+}
let Predicates = [UseIncDec, In64BitMode] in {
- def DEC64m : INCDECM<MRM1m, "dec", Xi64, -1>;
-} // Predicates
-} // SchedRW
-} // Defs = [EFLAGS]
-
-// Extra precision multiplication
-
-// AL is really implied by AX, but the registers in Defs must match the
-// SDNode results (i8, i32).
-// AL,AH = AL*GR8
-let Defs = [AL,EFLAGS,AX], Uses = [AL] in
-def MUL8r : MulOpR<0xF6, MRM4r, "mul", Xi8, WriteIMul8,
- // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
- // This probably ought to be moved to a def : Pat<> if the
- // syntax can be accepted.
- [(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)]>;
-// AX,DX = AX*GR16
-let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
-def MUL16r : MulOpR<0xF7, MRM4r, "mul", Xi16, WriteIMul16, []>;
-// EAX,EDX = EAX*GR32
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
-def MUL32r : MulOpR<0xF7, MRM4r, "mul", Xi32, WriteIMul32,
- [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>;
-// RAX,RDX = RAX*GR64
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
-def MUL64r : MulOpR<0xF7, MRM4r, "mul", Xi64, WriteIMul64,
- [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>;
-// AL,AH = AL*[mem8]
-let Defs = [AL,EFLAGS,AX], Uses = [AL] in
-def MUL8m : MulOpM<0xF6, MRM4m, "mul", Xi8, WriteIMul8,
- // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
- // This probably ought to be moved to a def : Pat<> if the
- // syntax can be accepted.
- [(set AL, (mul AL, (loadi8 addr:$src))),
- (implicit EFLAGS)]>;
-// AX,DX = AX*[mem16]
-let mayLoad = 1, hasSideEffects = 0 in {
-let Defs = [AX,DX,EFLAGS], Uses = [AX] in
-def MUL16m : MulOpM<0xF7, MRM4m, "mul", Xi16, WriteIMul16, []>;
-// EAX,EDX = EAX*[mem32]
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
-def MUL32m : MulOpM<0xF7, MRM4m, "mul", Xi32, WriteIMul32, []>;
-// RAX,RDX = RAX*[mem64]
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
-def MUL64m : MulOpM<0xF7, MRM4m, "mul", Xi64, WriteIMul64, []>,
- Requires<[In64BitMode]>;
+def INC64m : IncOpM_M<Xi64>;
+def DEC64m : DecOpM_M<Xi64>;
}
-let hasSideEffects = 0 in {
-// AL,AH = AL*GR8
-let Defs = [AL,EFLAGS,AX], Uses = [AL] in
-def IMUL8r : MulOpR<0xF6, MRM5r, "imul", Xi8, WriteIMul8, []>;
-// AX,DX = AX*GR16
-let Defs = [AX,DX,EFLAGS], Uses = [AX] in
-def IMUL16r : MulOpR<0xF7, MRM5r, "imul", Xi16, WriteIMul16, []>;
-// EAX,EDX = EAX*GR32
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
-def IMUL32r : MulOpR<0xF7, MRM5r, "imul", Xi32, WriteIMul32, []>;
-// RAX,RDX = RAX*GR64
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
-def IMUL64r : MulOpR<0xF7, MRM5r, "imul", Xi64, WriteIMul64, []>;
-
-let mayLoad = 1 in {
-// AL,AH = AL*[mem8]
-let Defs = [AL,EFLAGS,AX], Uses = [AL] in
-def IMUL8m : MulOpM<0xF6, MRM5m, "imul", Xi8, WriteIMul8, []>;
-// AX,DX = AX*[mem16]
-let Defs = [AX,DX,EFLAGS], Uses = [AX] in
-def IMUL16m : MulOpM<0xF7, MRM5m, "imul", Xi16, WriteIMul16, []>;
-// EAX,EDX = EAX*[mem32]
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
-def IMUL32m : MulOpM<0xF7, MRM5m, "imul", Xi32, WriteIMul32, []>;
-// RAX,RDX = RAX*[mem64]
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
-def IMUL64m : MulOpM<0xF7, MRM5m, "imul", Xi64, WriteIMul64, []>,
- Requires<[In64BitMode]>;
+//===----------------------------------------------------------------------===//
+// NEG and NOT Instructions
+//
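+// NegOpR_*/NegOpM_* - Instructions like "neg reg"/"neg [mem]"; the *F forms set EFLAGS.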
+class NegOpR_R<X86TypeInfo t, bit ndd = 0>
+ : UnaryOpR_R<0xF7, MRM3r, "neg", t, ineg, ndd>;
+class NegOpR_RF<X86TypeInfo t, bit ndd = 0>
+ : UnaryOpR_RF<0xF7, MRM3r, "neg", t, ineg, ndd>;
+class NegOpM_M<X86TypeInfo t> : UnaryOpM_M<0xF7, MRM3m, "neg", t, null_frag>;
+class NegOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xF7, MRM3m, "neg", t, ineg>;
+class NegOpM_R<X86TypeInfo t> : UnaryOpM_R<0xF7, MRM3m, "neg", t, null_frag>;
+class NegOpM_RF<X86TypeInfo t> : UnaryOpM_RF<0xF7, MRM3m, "neg", t, ineg>;
+
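+// NotOpR_*/NotOpM_* - Instructions like "not reg"/"not [mem]". Note: NOT does not set EFLAGS.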
+class NotOpR_R<X86TypeInfo t, bit ndd = 0>
+ : UnaryOpR_R<0xF7, MRM2r, "not", t, not, ndd>;
+class NotOpM_M<X86TypeInfo t> : UnaryOpM_M<0xF7, MRM2m, "not", t, not>;
+class NotOpM_R<X86TypeInfo t> : UnaryOpM_R<0xF7, MRM2m, "not", t, not>;
+
+let Predicates = [NoNDD] in {
+def NEG8r : NegOpR_RF<Xi8>;
+def NEG16r : NegOpR_RF<Xi16>, OpSize16;
+def NEG32r : NegOpR_RF<Xi32>, OpSize32;
+def NEG64r : NegOpR_RF<Xi64>;
+def NOT8r : NotOpR_R<Xi8>;
+def NOT16r : NotOpR_R<Xi16>, OpSize16;
+def NOT32r : NotOpR_R<Xi32>, OpSize32;
+def NOT64r : NotOpR_R<Xi64>;
}
-let Defs = [EFLAGS] in {
-let Constraints = "$src1 = $dst" in {
-let isCommutable = 1 in {
-// X = IMUL Y, Z --> X = IMUL Z, Y
-// Register-Register Signed Integer Multiply
-def IMUL16rr : IMulOpRR<0xAF, "imul", Xi16, WriteIMul16Reg>;
-def IMUL32rr : IMulOpRR<0xAF, "imul", Xi32, WriteIMul32Reg>;
-def IMUL64rr : IMulOpRR<0xAF, "imul", Xi64, WriteIMul64Reg>;
-} // isCommutable
-
-// Register-Memory Signed Integer Multiply
-def IMUL16rm : IMulOpRM<0xAF, "imul", Xi16, WriteIMul16Reg>;
-def IMUL32rm : IMulOpRM<0xAF, "imul", Xi32, WriteIMul32Reg>;
-def IMUL64rm : IMulOpRM<0xAF, "imul", Xi64, WriteIMul64Reg>;
-} // Constraints = "$src1 = $dst"
-} // Defs = [EFLAGS]
-
-// Surprisingly enough, these are not two address instructions!
-let Defs = [EFLAGS] in {
-// NOTE: These are order specific, we want the ri8 forms to be listed
-// first so that they are slightly preferred to the ri forms.
-
-// Register-Integer Signed Integer Multiply
-// GR16 = GR16*I8
-def IMUL16rri8 : IMulOpRRI8<0x6B, "imul", Xi16, WriteIMul16Imm>;
-// GR16 = GR16*I16
-def IMUL16rri : IMulOpRRI<0x69, "imul", Xi16, WriteIMul16Imm>;
-// GR32 = GR32*I8
-def IMUL32rri8 : IMulOpRRI8<0x6B, "imul", Xi32, WriteIMul32Imm>;
-// GR32 = GR32*I32
-def IMUL32rri : IMulOpRRI<0x69, "imul", Xi32, WriteIMul32Imm>;
-// GR64 = GR64*I8
-def IMUL64rri8 : IMulOpRRI8<0x6B, "imul", Xi64, WriteIMul64Imm>;
-// GR64 = GR64*I32
-def IMUL64rri32 : IMulOpRRI<0x69, "imul", Xi64, WriteIMul64Imm>;
-
-// Memory-Integer Signed Integer Multiply
-// GR16 = [mem16]*I8
-let mayLoad = 1 in {
-def IMUL16rmi8 : IMulOpRMI8<0x6B, "imul", Xi16, WriteIMul16Imm>;
-// GR16 = [mem16]*I16
-def IMUL16rmi : IMulOpRMI<0x69, "imul", Xi16, WriteIMul16Imm>;
-// GR32 = [mem32]*I8
-def IMUL32rmi8 : IMulOpRMI8<0x6B, "imul", Xi32, WriteIMul32Imm>;
-// GR32 = [mem32]*I32
-def IMUL32rmi : IMulOpRMI<0x69, "imul", Xi32, WriteIMul32Imm>;
-// GR64 = [mem64]*I8
-def IMUL64rmi8 : IMulOpRMI8<0x6B, "imul", Xi64, WriteIMul64Imm>;
-// GR64 = [mem64]*I32
-def IMUL64rmi32 : IMulOpRMI<0x69, "imul", Xi64, WriteIMul64Imm>;
-} // mayLoad
-} // Defs = [EFLAGS]
-} // hasSideEffects
-
-// unsigned division/remainder
-let hasSideEffects = 1 in { // so that we don't speculatively execute
-let Defs = [AL,AH,EFLAGS], Uses = [AX] in
-// AX/r8 = AL,AH
-def DIV8r : MulOpR<0xF6, MRM6r, "div", Xi8, WriteDiv8, []>;
-let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
-// DX:AX/r16 = AX,DX
-def DIV16r : MulOpR<0xF7, MRM6r, "div", Xi16, WriteDiv16, []>;
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
-// EDX:EAX/r32 = EAX,EDX
-def DIV32r : MulOpR<0xF7, MRM6r, "div", Xi32, WriteDiv32, []>;
-// RDX:RAX/r64 = RAX,RDX
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
-def DIV64r : MulOpR<0xF7, MRM6r, "div", Xi64, WriteDiv64, []>;
-
-let mayLoad = 1 in {
-let Defs = [AL,AH,EFLAGS], Uses = [AX] in
-// AX/[mem8] = AL,AH
-def DIV8m : MulOpM<0xF6, MRM6m, "div", Xi8, WriteDiv8, []>;
-let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
-// DX:AX/[mem16] = AX,DX
-def DIV16m : MulOpM<0xF7, MRM6m, "div", Xi16, WriteDiv16, []>;
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
-def DIV32m : MulOpM<0xF7, MRM6m, "div", Xi32, WriteDiv32, []>;
-// RDX:RAX/[mem64] = RAX,RDX
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
-def DIV64m : MulOpM<0xF7, MRM6m, "div", Xi64, WriteDiv64, []>,
- Requires<[In64BitMode]>;
+let Predicates = [HasNDD, In64BitMode] in {
+def NEG8r_ND : NegOpR_RF<Xi8, 1>;
+def NEG16r_ND : NegOpR_RF<Xi16, 1>, PD;
+def NEG32r_ND : NegOpR_RF<Xi32, 1>;
+def NEG64r_ND : NegOpR_RF<Xi64, 1>;
+
+def NOT8r_ND : NotOpR_R<Xi8, 1>;
+def NOT16r_ND : NotOpR_R<Xi16, 1>, PD;
+def NOT32r_ND : NotOpR_R<Xi32, 1>;
+def NOT64r_ND : NotOpR_R<Xi64, 1>;
+
+def NEG8r_NF_ND : NegOpR_R<Xi8, 1>, EVEX_NF;
+def NEG16r_NF_ND : NegOpR_R<Xi16, 1>, EVEX_NF, PD;
+def NEG32r_NF_ND : NegOpR_R<Xi32, 1>, EVEX_NF;
+def NEG64r_NF_ND : NegOpR_R<Xi64, 1>, EVEX_NF;
}
-// Signed division/remainder.
-let Defs = [AL,AH,EFLAGS], Uses = [AX] in
-// AX/r8 = AL,AH
-def IDIV8r : MulOpR<0xF6, MRM7r, "idiv", Xi8, WriteIDiv8, []>;
-let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
-// DX:AX/r16 = AX,DX
-def IDIV16r: MulOpR<0xF7, MRM7r, "idiv", Xi16, WriteIDiv16, []>;
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
-// EDX:EAX/r32 = EAX,EDX
-def IDIV32r: MulOpR<0xF7, MRM7r, "idiv", Xi32, WriteIDiv32, []>;
-// RDX:RAX/r64 = RAX,RDX
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
-def IDIV64r: MulOpR<0xF7, MRM7r, "idiv", Xi64, WriteIDiv64, []>;
-
-let mayLoad = 1 in {
-let Defs = [AL,AH,EFLAGS], Uses = [AX] in
-// AX/[mem8] = AL,AH
-def IDIV8m : MulOpM<0xF6, MRM7m, "idiv", Xi8, WriteIDiv8, []>;
-let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
-// DX:AX/[mem16] = AX,DX
-def IDIV16m: MulOpM<0xF7, MRM7m, "idiv", Xi16, WriteIDiv16, []>;
-let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
-// EDX:EAX/[mem32] = EAX,EDX
-def IDIV32m: MulOpM<0xF7, MRM7m, "idiv", Xi32, WriteIDiv32, []>;
-let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
-// RDX:RAX/[mem64] = RAX,RDX
-def IDIV64m: MulOpM<0xF7, MRM7m, "idiv", Xi64, WriteIDiv64, []>,
- Requires<[In64BitMode]>;
+def NEG8m : NegOpM_MF<Xi8>;
+def NEG16m : NegOpM_MF<Xi16>, OpSize16;
+def NEG32m : NegOpM_MF<Xi32>, OpSize32;
+def NEG64m : NegOpM_MF<Xi64>, Requires<[In64BitMode]>;
+
+let Predicates = [HasNDD, In64BitMode] in {
+def NEG8m_ND : NegOpM_RF<Xi8>;
+def NEG16m_ND : NegOpM_RF<Xi16>, PD;
+def NEG32m_ND : NegOpM_RF<Xi32>;
+def NEG64m_ND : NegOpM_RF<Xi64>;
+
+def NEG8m_NF_ND : NegOpM_R<Xi8>, EVEX_NF;
+def NEG16m_NF_ND : NegOpM_R<Xi16>, EVEX_NF, PD;
+def NEG32m_NF_ND : NegOpM_R<Xi32>, EVEX_NF;
+def NEG64m_NF_ND : NegOpM_R<Xi64>, EVEX_NF;
}
-} // hasSideEffects = 1
-//===----------------------------------------------------------------------===//
-// Two address Instructions.
-//
-
-// unary instructions
-let Defs = [EFLAGS] in {
-let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-def NEG8r : NegOpR<0xF6, "neg", Xi8>;
-def NEG16r : NegOpR<0xF7, "neg", Xi16>;
-def NEG32r : NegOpR<0xF7, "neg", Xi32>;
-def NEG64r : NegOpR<0xF7, "neg", Xi64>;
-} // Constraints = "$src1 = $dst", SchedRW
-
-// Read-modify-write negate.
-let SchedRW = [WriteALURMW] in {
-def NEG8m : NegOpM<0xF6, "neg", Xi8>;
-def NEG16m : NegOpM<0xF7, "neg", Xi16>;
-def NEG32m : NegOpM<0xF7, "neg", Xi32>;
-def NEG64m : NegOpM<0xF7, "neg", Xi64>, Requires<[In64BitMode]>;
-} // SchedRW
-} // Defs = [EFLAGS]
-
-
-// Note: NOT does not set EFLAGS!
+def NOT8m : NotOpM_M<Xi8>;
+def NOT16m : NotOpM_M<Xi16>, OpSize16;
+def NOT32m : NotOpM_M<Xi32>, OpSize32;
+def NOT64m : NotOpM_M<Xi64>, Requires<[In64BitMode]>;
-let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
-def NOT8r : NotOpR<0xF6, "not", Xi8>;
-def NOT16r : NotOpR<0xF7, "not", Xi16>;
-def NOT32r : NotOpR<0xF7, "not", Xi32>;
-def NOT64r : NotOpR<0xF7, "not", Xi64>;
-} // Constraints = "$src1 = $dst", SchedRW
+let Predicates = [HasNDD, In64BitMode] in {
+def NOT8m_ND : NotOpM_R<Xi8>;
+def NOT16m_ND : NotOpM_R<Xi16>, PD;
+def NOT32m_ND : NotOpM_R<Xi32>;
+def NOT64m_ND : NotOpM_R<Xi64>;
+}
-let SchedRW = [WriteALURMW] in {
-def NOT8m : NotOpM<0xF6, "not", Xi8>;
-def NOT16m : NotOpM<0xF7, "not", Xi16>;
-def NOT32m : NotOpM<0xF7, "not", Xi32>;
-def NOT64m : NotOpM<0xF7, "not", Xi64>, Requires<[In64BitMode]>;
-} // SchedRW
+let Predicates = [In64BitMode], Pattern = [(null_frag)] in {
+def NEG8r_NF : NegOpR_R<Xi8>, NF;
+def NEG16r_NF : NegOpR_R<Xi16>, NF, PD;
+def NEG32r_NF : NegOpR_R<Xi32>, NF;
+def NEG64r_NF : NegOpR_R<Xi64>, NF;
+def NEG8m_NF : NegOpM_M<Xi8>, NF;
+def NEG16m_NF : NegOpM_M<Xi16>, NF, PD;
+def NEG32m_NF : NegOpM_M<Xi32>, NF;
+def NEG64m_NF : NegOpM_M<Xi64>, NF;
+
+def NEG8r_EVEX : NegOpR_RF<Xi8>, PL;
+def NEG16r_EVEX : NegOpR_RF<Xi16>, PL, PD;
+def NEG32r_EVEX : NegOpR_RF<Xi32>, PL;
+def NEG64r_EVEX : NegOpR_RF<Xi64>, PL;
+
+def NOT8r_EVEX : NotOpR_R<Xi8>, PL;
+def NOT16r_EVEX : NotOpR_R<Xi16>, PL, PD;
+def NOT32r_EVEX : NotOpR_R<Xi32>, PL;
+def NOT64r_EVEX : NotOpR_R<Xi64>, PL;
+
+def NEG8m_EVEX : NegOpM_MF<Xi8>, PL;
+def NEG16m_EVEX : NegOpM_MF<Xi16>, PL, PD;
+def NEG32m_EVEX : NegOpM_MF<Xi32>, PL;
+def NEG64m_EVEX : NegOpM_MF<Xi64>, PL;
+
+def NOT8m_EVEX : NotOpM_M<Xi8>, PL;
+def NOT16m_EVEX : NotOpM_M<Xi16>, PL, PD;
+def NOT32m_EVEX : NotOpM_M<Xi32>, PL;
+def NOT64m_EVEX : NotOpM_M<Xi64>, PL;
+}
/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
/// defined with "(set GPR:$dst, EFLAGS, (...".
@@ -787,81 +347,216 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
SDNode opnodeflag, SDNode opnode,
bit CommutableRR, bit ConvertibleToThreeAddress,
bit ConvertibleToThreeAddressRR> {
- let Defs = [EFLAGS] in {
- let Constraints = "$src1 = $dst" in {
- let isCommutable = CommutableRR in {
- let isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {
- def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
- def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
- def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
- def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
- } // isConvertibleToThreeAddress
- } // isCommutable
-
- def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+ let isCommutable = CommutableRR,
+ isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {
+ let Predicates = [NoNDD] in {
+ def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16;
+ def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>, OpSize32;
+ def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+ }
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag, 1>;
+ def NAME#16rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag, 1>, PD;
+ def NAME#32rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag, 1>;
+ def NAME#64rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag, 1>;
+ def NAME#8rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi8, 1>, EVEX_NF;
+ def NAME#16rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def NAME#32rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi32, 1>, EVEX_NF;
+ def NAME#64rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi64, 1>, EVEX_NF;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#8rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi8>, NF;
+ def NAME#16rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi16>, NF, PD;
+ def NAME#32rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi32>, NF;
+ def NAME#64rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi64>, NF;
+ def NAME#8rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def NAME#16rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def NAME#32rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def NAME#64rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ }
+ }
+
+ def NAME#8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
+ def NAME#32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
+ def NAME#64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>;
+ let Predicates = [In64BitMode] in {
+ def NAME#8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
+ def NAME#16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
+ def NAME#32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
+ def NAME#64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
+ def NAME#8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
+ def NAME#16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
+ def NAME#32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
+ def NAME#64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
+ def NAME#8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF;
+ def NAME#16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD;
+ def NAME#32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF;
+ def NAME#64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF;
+ def NAME#8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
+ def NAME#16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def NAME#32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
+ def NAME#64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
+ }
+ let Predicates = [NoNDD] in {
def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
- def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
- def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+ def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16;
+ def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32;
def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+ }
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>;
+ def NAME#16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD;
+ def NAME#32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>;
+ def NAME#64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>;
+ def NAME#8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF;
+ def NAME#16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD;
+ def NAME#32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF;
+ def NAME#64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF;
+ def NAME#16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD;
+ def NAME#32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF;
+ def NAME#64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF;
+ def NAME#8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL;
+ def NAME#16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD;
+ def NAME#32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL;
+ def NAME#64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL;
+ }
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress, hasSideEffects= 0 in {
- def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
-
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ let Predicates = [NoNDD] in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
- def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, RegMRM>;
- def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, RegMRM>;
- def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, RegMRM>;
-
- def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
- def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
- def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+ def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def NAME#64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>;
+ def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+ def NAME#16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16;
+ def NAME#32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32;
+ def NAME#64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>;
+ }
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
+ def NAME#32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
+ def NAME#64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
+ def NAME#8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>;
+ def NAME#16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD;
+ def NAME#32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>;
+ def NAME#64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>;
+ def NAME#16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
+ def NAME#32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
+ def NAME#64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
+ def NAME#8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF;
+ def NAME#16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
+ def NAME#32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
+ def NAME#64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD;
+ def NAME#32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF;
+ def NAME#64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF;
+ def NAME#8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF;
+ def NAME#16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD;
+ def NAME#32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF;
+ def NAME#64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF;
+ def NAME#16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
+ def NAME#32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
+ def NAME#64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
+ def NAME#8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL;
+ def NAME#16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD;
+ def NAME#32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL;
+ def NAME#64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL;
}
- } // Constraints = "$src1 = $dst"
-
- let mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
- def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
- def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
-
- // NOTE: These are order specific, we want the mi8 forms to be listed
- // first so that they are slightly preferred to the mi forms.
- def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, MemMRM>;
- def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, MemMRM>;
- let Predicates = [In64BitMode] in
- def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, MemMRM>;
-
- def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>;
- def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>;
- let Predicates = [In64BitMode] in
- def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>;
- }
-
- // These are for the disassembler since 0x82 opcode behaves like 0x80, but
- // not in 64-bit mode.
- let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
- hasSideEffects = 0 in {
- let Constraints = "$src1 = $dst" in
- def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>;
- let mayLoad = 1, mayStore = 1 in
- def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, MemMRM>;
}
- } // Defs = [EFLAGS]
- def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
- def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">;
- def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">;
- def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ def NAME#8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def NAME#32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def NAME#64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>;
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
+ def NAME#32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>;
+ def NAME#8mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF;
+ def NAME#16mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD;
+ def NAME#32mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF;
+ def NAME#64mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi64>, EVEX_NF;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#8mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi8>, NF;
+ def NAME#16mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi16>, NF, PD;
+ def NAME#32mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi32>, NF;
+ def NAME#64mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi64>, NF;
+ def NAME#8mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def NAME#16mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def NAME#32mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def NAME#64mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ }
+
+ // NOTE: These are order specific, we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16;
+ def NAME#32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>;
+ def NAME#8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
+ def NAME#32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#16mi8_ND : BinOpMI8_RF<mnemonic, Xi16, MemMRM>, PD;
+ def NAME#32mi8_ND : BinOpMI8_RF<mnemonic, Xi32, MemMRM>;
+ def NAME#64mi8_ND : BinOpMI8_RF<mnemonic, Xi64, MemMRM>;
+ def NAME#8mi_ND : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi_ND : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD;
+ def NAME#32mi_ND : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ def NAME#16mi8_NF_ND : BinOpMI8_R<mnemonic, Xi16, MemMRM>, NF, PD;
+ def NAME#32mi8_NF_ND : BinOpMI8_R<mnemonic, Xi32, MemMRM>, NF;
+ def NAME#64mi8_NF_ND : BinOpMI8_R<mnemonic, Xi64, MemMRM>, NF;
+ def NAME#8mi_NF_ND : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF;
+ def NAME#16mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD;
+ def NAME#32mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF;
+ def NAME#64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#16mi8_NF : BinOpMI8_M<mnemonic, Xi16, MemMRM>, NF, PD;
+ def NAME#32mi8_NF : BinOpMI8_M<mnemonic, Xi32, MemMRM>, NF;
+ def NAME#64mi8_NF : BinOpMI8_M<mnemonic, Xi64, MemMRM>, NF;
+ def NAME#8mi_NF : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF;
+ def NAME#16mi_NF : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD;
+ def NAME#32mi_NF : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF;
+ def NAME#64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF;
+ def NAME#16mi8_EVEX : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, PL, PD;
+ def NAME#32mi8_EVEX : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, PL;
+ def NAME#64mi8_EVEX : BinOpMI8_MF<mnemonic, Xi64, MemMRM>, PL;
+ def NAME#8mi_EVEX : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL;
+ def NAME#16mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD;
+ def NAME#32mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL;
+ def NAME#64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL;
+ }
+
+ // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+ // not in 64-bit mode.
+ let Predicates = [Not64BitMode] in {
+ def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ def NAME#8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
+ }
+
+ def NAME#8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">, OpSize16;
+ def NAME#32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">, OpSize32;
+ def NAME#64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|rax, $src}">;
}
/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
@@ -874,80 +569,164 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
string mnemonic, Format RegMRM, Format MemMRM,
SDNode opnode, bit CommutableRR,
bit ConvertibleToThreeAddress> {
- let Uses = [EFLAGS], Defs = [EFLAGS] in {
- let Constraints = "$src1 = $dst" in {
- let isCommutable = CommutableRR in {
- def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
- def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
- } // isConvertibleToThreeAddress
- } // isCommutable
-
- def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
-
- def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
- def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
- def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
- def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
-
- def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
-
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress, hasSideEffects = 0 in {
- // NOTE: These are order specific, we want the ri8 forms to be listed
- // first so that they are slightly preferred to the ri forms.
- def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, RegMRM>;
- def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, RegMRM>;
- def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, RegMRM>;
-
- def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
- def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
- def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ let isCommutable = CommutableRR in {
+ let Predicates = [NoNDD] in {
+ def NAME#8rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def NAME#32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def NAME#64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>;
}
- } // Constraints = "$src1 = $dst"
-
- def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
- def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
-
- // NOTE: These are order specific, we want the mi8 forms to be listed
- // first so that they are slightly preferred to the mi forms.
- let mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
- def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, MemMRM>;
- def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, MemMRM>;
- let Predicates = [In64BitMode] in
- def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, MemMRM>;
-
- def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>;
- def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>;
- let Predicates = [In64BitMode] in
- def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>;
}
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode, 1>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode, 1>, PD;
+ def NAME#32rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode, 1>;
+ def NAME#64rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode, 1>;
+ }
+ }
+ } // isCommutable
+
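+ // EVEX-promoted legacy forms (suffix _EVEX): the same operation re-encoded
+ // in the extended EVEX space, e.g. so the APX extended GPRs can be used.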
+ let Predicates = [In64BitMode] in {
+ def NAME#8rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def NAME#16rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def NAME#32rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def NAME#64rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ }
+
+ def NAME#8rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
+ def NAME#32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
+ def NAME#64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>;
+ let Predicates = [In64BitMode] in {
+ def NAME#8rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>;
+ def NAME#16rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD;
+ def NAME#32rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>;
+ def NAME#64rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>;
+ def NAME#8rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL;
+ def NAME#16rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD;
+ def NAME#32rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL;
+ def NAME#64rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;
+ }
+
+ let Predicates = [NoNDD] in {
+ def NAME#8rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16;
+ def NAME#32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32;
+ def NAME#64rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>;
+ }
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode, 1>;
+ def NAME#16rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode, 1>, PD;
+ def NAME#32rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode, 1>;
+ def NAME#64rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode, 1>;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#8rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>, PL;
+ def NAME#16rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, PL, PD;
+ def NAME#32rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, PL;
+ def NAME#64rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>, PL;
+ }
+
+ let Predicates = [NoNDD] in {
+ def NAME#8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order-specific; we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def NAME#64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>;
- // These are for the disassembler since 0x82 opcode behaves like 0x80, but
- // not in 64-bit mode.
- let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
- hasSideEffects = 0 in {
- let Constraints = "$src1 = $dst" in
- def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, RegMRM>;
- let mayLoad = 1, mayStore = 1 in
- def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, MemMRM>;
+ def NAME#16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
+ def NAME#32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
+ def NAME#64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>;
+ }
+ }
+
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8ri_ND : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
+ def NAME#32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
+ def NAME#64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
+ def NAME#16ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD;
+ def NAME#32ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>;
+ def NAME#64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>;
}
- } // Uses = [EFLAGS], Defs = [EFLAGS]
-
- def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
- def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">;
- def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">;
- def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX,
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#8ri_EVEX : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL;
+ def NAME#16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD;
+ def NAME#32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL;
+ def NAME#64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL;
+ def NAME#16ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD;
+ def NAME#32ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL;
+ def NAME#64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL;
+ }
+
+ def NAME#8mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def NAME#32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def NAME#64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>;
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD;
+ def NAME#32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#8mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL;
+ def NAME#16mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD;
+ def NAME#32mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL;
+ def NAME#64mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;
+ }
+
+ // NOTE: These are order-specific; we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16;
+ def NAME#32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>;
+ def NAME#16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
+ def NAME#32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+
+ let Predicates = [HasNDD, In64BitMode] in {
+ def NAME#8mi_ND : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi8_ND : BinOpMI8F_RF<mnemonic, Xi16, MemMRM>, PD;
+ def NAME#32mi8_ND : BinOpMI8F_RF<mnemonic, Xi32, MemMRM>;
+ def NAME#64mi8_ND : BinOpMI8F_RF<mnemonic, Xi64, MemMRM>;
+ def NAME#16mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD;
+ def NAME#32mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>;
+ }
+ let Predicates = [In64BitMode] in {
+ def NAME#8mi_EVEX : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL;
+ def NAME#16mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, PL, PD;
+ def NAME#32mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, PL;
+ def NAME#64mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>, PL;
+ def NAME#16mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD;
+ def NAME#32mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL;
+ def NAME#64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL;
+ }
+
+ // These are for the disassembler only, since the 0x82 opcode behaves like
+ // 0x80 but is not valid in 64-bit mode.
+ let Predicates = [Not64BitMode] in {
+ def NAME#8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ def NAME#8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;
+ }
+
+ def NAME#8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
+ def NAME#16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|ax, $src}">, OpSize16;
+ def NAME#32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|eax, $src}">, OpSize32;
+ def NAME#64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX,
"{$src, %rax|rax, $src}">;
}
@@ -957,93 +736,88 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
///
multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
string mnemonic, Format RegMRM, Format MemMRM,
- SDNode opnode,
- bit CommutableRR, bit ConvertibleToThreeAddress> {
- let Defs = [EFLAGS] in {
- let isCommutable = CommutableRR in {
- def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
- def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
- }
- } // isCommutable
-
- def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
-
- def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
- def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
- def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
- def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
-
- def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
-
- let isConvertibleToThreeAddress = ConvertibleToThreeAddress, hasSideEffects = 0 in {
- // NOTE: These are order specific, we want the ri8 forms to be listed
- // first so that they are slightly preferred to the ri forms.
- def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, RegMRM>;
- def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, RegMRM>;
- def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, RegMRM>;
-
- def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
- def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
- def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
- }
-
- def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
- def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
-
- // NOTE: These are order specific, we want the mi8 forms to be listed
- // first so that they are slightly preferred to the mi forms.
- let mayLoad = 1, hasSideEffects = 0 in {
- def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>;
- def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>;
- let Predicates = [In64BitMode] in
- def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>;
-
- def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
- def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>;
- def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>;
- let Predicates = [In64BitMode] in
- def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>;
- }
-
- // These are for the disassembler since 0x82 opcode behaves like 0x80, but
- // not in 64-bit mode.
- let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
- hasSideEffects = 0 in {
- def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>;
- let mayLoad = 1 in
- def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>;
- }
- } // Defs = [EFLAGS]
-
- def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|al, $src}">;
+ SDNode opnode, bit CommutableRR,
+ bit ConvertibleToThreeAddress> {
+ let isCommutable = CommutableRR in {
+ def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ } // isConvertibleToThreeAddress
+ } // isCommutable
+
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+
+ def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16;
+ def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32;
+ def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+ def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+ let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+ // NOTE: These are order-specific; we want the ri8 forms to be listed
+ // first so that they are slightly preferred to the ri forms.
+ def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
+ def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
+ def NAME#64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>;
+
+ def NAME#16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16;
+ def NAME#32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32;
+ def NAME#64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>;
+ }
+
+ def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16;
+ def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32;
+ def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+ // NOTE: These are order-specific; we want the mi8 forms to be listed
+ // first so that they are slightly preferred to the mi forms.
+ def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16;
+ def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>;
+
+ def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16;
+ def NAME#32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;
+ let Predicates = [In64BitMode] in
+ def NAME#64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>;
+
+ // These are for the disassembler only, since the 0x82 opcode behaves like
+ // 0x80 but is not valid in 64-bit mode.
+ let Predicates = [Not64BitMode] in {
+ def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;
+ let mayLoad = 1 in
+ def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>;
+ }
+
+ def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|al, $src}">;
def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|ax, $src}">;
+ "{$src, %ax|ax, $src}">, OpSize16;
def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|eax, $src}">;
+ "{$src, %eax|eax, $src}">, OpSize32;
def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ "{$src, %rax|rax, $src}">;
}
-defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+defm AND : ArithBinOp_RF<0x21, 0x23, 0x25, "and", MRM4r, MRM4m,
X86and_flag, and, 1, 0, 0>;
-defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+defm OR : ArithBinOp_RF<0x09, 0x0B, 0x0D, "or", MRM1r, MRM1m,
X86or_flag, or, 1, 0, 0>;
-defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+defm XOR : ArithBinOp_RF<0x31, 0x33, 0x35, "xor", MRM6r, MRM6m,
X86xor_flag, xor, 1, 0, 0>;
-defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+defm ADD : ArithBinOp_RF<0x01, 0x03, 0x05, "add", MRM0r, MRM0m,
X86add_flag, add, 1, 1, 1>;
let isCompare = 1 in {
-defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+defm SUB : ArithBinOp_RF<0x29, 0x2B, 0x2D, "sub", MRM5r, MRM5m,
X86sub_flag, sub, 0, 1, 0>;
}
@@ -1057,13 +831,13 @@ def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst),
Sched<[WriteALU]>;
// Arithmetic.
-defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+defm ADC : ArithBinOp_RFF<0x11, 0x13, 0x15, "adc", MRM2r, MRM2m, X86adc_flag,
1, 0>;
-defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+defm SBB : ArithBinOp_RFF<0x19, 0x1B, 0x1D, "sbb", MRM3r, MRM3m, X86sbb_flag,
0, 0>;
let isCompare = 1 in {
-defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+defm CMP : ArithBinOp_F<0x39, 0x3B, 0x3D, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
}
// Patterns to recognize loads on the LHS of an ADC. We can't make X86adc_flag
@@ -1201,44 +975,37 @@ def : Pat<(store (X86adc_flag i64relocImmSExt32_su:$src, (load addr:$dst), EFLAG
// they don't have all the usual imm8 and REV forms, and are encoded into a
// different space.
let isCompare = 1 in {
- let Defs = [EFLAGS] in {
- let isCommutable = 1 in {
- // Avoid selecting these and instead use a test+and. Post processing will
- // combine them. This gives bunch of other patterns that start with
- // and a chance to match.
- def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>;
- def TEST16rr : BinOpRR_F<0x84, "test", Xi16, null_frag>;
- def TEST32rr : BinOpRR_F<0x84, "test", Xi32, null_frag>;
- def TEST64rr : BinOpRR_F<0x84, "test", Xi64, null_frag>;
- } // isCommutable
-
- let hasSideEffects = 0, mayLoad = 1 in {
- def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>;
- def TEST16mr : BinOpMR_F<0x84, "test", Xi16, null_frag>;
- def TEST32mr : BinOpMR_F<0x84, "test", Xi32, null_frag>;
- def TEST64mr : BinOpMR_F<0x84, "test", Xi64, null_frag>;
- }
-
- def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
- def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
- def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
- def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
-
- def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
- def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>;
- def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
- let Predicates = [In64BitMode] in
- def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
- } // Defs = [EFLAGS]
-
- def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL,
- "{$src, %al|al, $src}">;
- def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX,
- "{$src, %ax|ax, $src}">;
- def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX,
- "{$src, %eax|eax, $src}">;
- def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX,
- "{$src, %rax|rax, $src}">;
+ let isCommutable = 1 in {
+ // Avoid selecting these and instead use a test+and. Post-processing will
+ // combine them. This gives a bunch of other patterns that start with an
+ // 'and' a chance to match.
+ def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>;
+ def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>, OpSize16;
+ def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>, OpSize32;
+ def TEST64rr : BinOpRR_F<0x85, "test", Xi64, null_frag>;
+ } // isCommutable
+
+def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>;
+def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>, OpSize16;
+def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>, OpSize32;
+def TEST64mr : BinOpMR_F<0x85, "test", Xi64, null_frag>;
+
+def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>, OpSize16;
+def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>, OpSize32;
+def TEST64ri32 : BinOpRI_F<0xF7, "test", Xi64, X86testpat, MRM0r>;
+
+def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
+def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>, OpSize16;
+def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>, OpSize32;
+
+ let Predicates = [In64BitMode] in
+ def TEST64mi32 : BinOpMI_F<0xF7, "test", Xi64, X86testpat, MRM0m>;
+
+def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">;
+def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16;
+def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32;
+def TEST64i32 : BinOpAI_F<0xA9, "test", Xi64, RAX, "{$src, %rax|rax, $src}">;
} // isCompare
// Patterns to match a relocImm into the immediate field.
@@ -1263,36 +1030,30 @@ def : Pat<(X86testpat (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
//===----------------------------------------------------------------------===//
// ANDN Instruction
//
-multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
- PatFrag ld_frag, X86FoldableSchedWrite sched> {
-let Predicates = [HasBMI, NoEGPR] in {
- def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
- !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
- VEX_4V, Sched<[sched]>;
- def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
- !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS,
- (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
- VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
-let Predicates = [HasBMI, HasEGPR, In64BitMode] in {
- def rr_EVEX : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
- !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
- EVEX_4V, Sched<[sched]>;
- def rm_EVEX : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
- !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS,
- (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+multiclass AndN<X86TypeInfo t, string suffix> {
+ defvar andn_rr_p =
+ [(set t.RegClass:$dst, EFLAGS, (X86and_flag (not t.RegClass:$src1),
+ t.RegClass:$src2))];
+ defvar andn_rm_p =
+ [(set t.RegClass:$dst, EFLAGS, (X86and_flag (not t.RegClass:$src1),
+ (t.LoadNode addr:$src2)))];
+ def rr#suffix : ITy<0xF2, MRMSrcReg, t, (outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.RegClass:$src2), "andn",
+ binop_ndd_args, andn_rr_p>, VVVV, Sched<[WriteALU]>,
+ T8, DefEFLAGS;
+ def rm#suffix : ITy<0xF2, MRMSrcMem, t, (outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.MemOperand:$src2), "andn",
+ binop_ndd_args, andn_rm_p>, VVVV,
+ Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>,
+ T8, DefEFLAGS;
}
// Complexity is reduced to give and with immediate a chance to match first.
-let Defs = [EFLAGS], AddedComplexity = -6 in {
- defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS;
- defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, REX_W;
+let AddedComplexity = -6 in {
+defm ANDN32 : AndN<Xi32, "">, VEX, Requires<[HasBMI, NoEGPR]>;
+defm ANDN64 : AndN<Xi64, "">, VEX, REX_W, Requires<[HasBMI, NoEGPR]>;
+defm ANDN32 : AndN<Xi32, "_EVEX">, EVEX, Requires<[HasBMI, HasEGPR, In64BitMode]>;
+defm ANDN64 : AndN<Xi64, "_EVEX">, EVEX, REX_W, Requires<[HasBMI, HasEGPR, In64BitMode]>;
}
let Predicates = [HasBMI], AddedComplexity = -6 in {
@@ -1309,91 +1070,63 @@ let Predicates = [HasBMI], AddedComplexity = -6 in {
//===----------------------------------------------------------------------===//
// MULX Instruction
//
-multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
- X86FoldableSchedWrite sched> {
-let hasSideEffects = 0 in {
-let Predicates = [HasBMI2, NoEGPR] in {
- def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
- !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>;
-
+multiclass MulX<X86TypeInfo t, X86FoldableSchedWrite sched> {
+ defvar mulx_args = "{$src, $dst2, $dst1|$dst1, $dst2, $src}";
+ defvar mulx_rm_sched =
+ [WriteIMulHLd, sched.Folded,
+ // Memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Implicit read of EDX/RDX
+ sched.ReadAfterFold];
+
+ def rr : ITy<0xF6, MRMSrcReg, t, (outs t.RegClass:$dst1, t.RegClass:$dst2),
+ (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD, VEX,
+ VVVV, Sched<[WriteIMulH, sched]>;
let mayLoad = 1 in
- def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
- !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, VEX_4V,
- Sched<[WriteIMulHLd, sched.Folded,
- // Memory operand.
- ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
- // Implicit read of EDX/RDX
- sched.ReadAfterFold]>;
-
+ def rm : ITy<0xF6, MRMSrcMem, t, (outs t.RegClass:$dst1, t.RegClass:$dst2),
+ (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, VEX,
+ VVVV, Sched<mulx_rm_sched>;
+
+ let Predicates = [In64BitMode] in {
+ def rr_EVEX : ITy<0xF6, MRMSrcReg, t,
+ (outs t.RegClass:$dst1, t.RegClass:$dst2),
+ (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD,
+ EVEX, VVVV, Sched<[WriteIMulH, sched]>;
+ let mayLoad = 1 in
+ def rm_EVEX : ITy<0xF6, MRMSrcMem, t,
+ (outs t.RegClass:$dst1, t.RegClass:$dst2),
+ (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD,
+ EVEX, VVVV, Sched<mulx_rm_sched>;
+ }
// Pseudo instructions to be used when the low result isn't used. The
// instruction is defined to keep the high if both destinations are the same.
- def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
- []>, Sched<[sched]>;
-
+ def Hrr : PseudoI<(outs t.RegClass:$dst), (ins t.RegClass:$src), []>,
+ Sched<[sched]>;
let mayLoad = 1 in
- def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
- []>, Sched<[sched.Folded]>;
-}
-let Predicates = [HasBMI2, HasEGPR, In64BitMode] in
- def rr#_EVEX : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
- !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, EVEX_4V, Sched<[WriteIMulH, sched]>;
-let Predicates = [HasBMI2, HasEGPR, In64BitMode], mayLoad = 1 in
- def rm#_EVEX : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
- !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, EVEX_4V,
- Sched<[WriteIMulHLd, sched.Folded,
- // Memory operand.
- ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
- // Implicit read of EDX/RDX
- sched.ReadAfterFold]>;
-}
+ def Hrm : PseudoI<(outs t.RegClass:$dst), (ins t.MemOperand:$src), []>,
+ Sched<[sched.Folded]>;
}
let Uses = [EDX] in
- defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteMULX32>;
+defm MULX32 : MulX<Xi32, WriteMULX32>;
+
let Uses = [RDX] in
- defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteMULX64>, REX_W;
+defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W;
//===----------------------------------------------------------------------===//
// ADCX and ADOX Instructions
//
// We don't have patterns for these as there is no advantage over ADC for
// most code.
-class ADCOXOpRR <bits<8> opcode, string mnemonic, X86TypeInfo info>
- : BinOpRR_C<opcode, MRMSrcReg, mnemonic, info, []>{
- let Opcode = opcode;
- let OpSize = OpSizeFixed;
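+// ADCX/ADOX reuse the generic EFLAGS-reading/-writing binop classes; the
+// ModRM form is set explicitly for the register and memory variants.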
+let Form = MRMSrcReg in {
+def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD;
+def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD;
+def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS;
+def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS;
}
-
-class ADCOXOpRM <bits<8> opcode, string mnemonic, X86TypeInfo info>
- : BinOpRM_C<opcode, MRMSrcMem, mnemonic, info, []>{
- let Opcode = opcode;
- let OpSize = OpSizeFixed;
-}
-
-let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
- Constraints = "$src1 = $dst", hasSideEffects = 0 in {
- let SchedRW = [WriteADC], isCommutable = 1 in {
- def ADCX32rr : ADCOXOpRR<0xF6, "adcx", Xi32>, T8PD;
- def ADCX64rr : ADCOXOpRR<0xF6, "adcx", Xi64>, T8PD;
-
- def ADOX32rr : ADCOXOpRR<0xF6, "adox", Xi32>, T8XS;
- def ADOX64rr : ADCOXOpRR<0xF6, "adox", Xi64>, T8XS;
- } // SchedRW
-
- let mayLoad = 1,
- SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold,
- // Memory operand.
- ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
- // Implicit read of EFLAGS
- WriteADC.ReadAfterFold] in {
- def ADCX32rm : ADCOXOpRM<0xF6, "adcx", Xi32>, T8PD;
- def ADCX64rm : ADCOXOpRM<0xF6, "adcx", Xi64>, T8PD;
-
- def ADOX32rm : ADCOXOpRM<0xF6, "adox", Xi32>, T8XS;
- def ADOX64rm : ADCOXOpRM<0xF6, "adox", Xi64>, T8XS;
- } // mayLoad, SchedRW
+let Form = MRMSrcMem in {
+def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD;
+def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD;
+def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS;
+def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS;
}
diff --git a/llvm/lib/Target/X86/X86InstrAsmAlias.td b/llvm/lib/Target/X86/X86InstrAsmAlias.td
index f1a90d9c59c3..2590be8651d5 100644
--- a/llvm/lib/Target/X86/X86InstrAsmAlias.td
+++ b/llvm/lib/Target/X86/X86InstrAsmAlias.td
@@ -55,6 +55,11 @@ multiclass CMPCCXADD_Aliases<string Cond, int CC> {
(CMPCCXADDmr32 GR32:$dst, i32mem:$dstsrc2, GR32:$src3, CC), 0>;
def : InstAlias<"cmp"#Cond#"xadd"#"\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
(CMPCCXADDmr64 GR64:$dst, i64mem:$dstsrc2, GR64:$src3, CC), 0>;
+
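+ // The EVEX-encoded variants accept the same alias spelling.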
+ def : InstAlias<"cmp"#Cond#"xadd"#"\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
+ (CMPCCXADDmr32_EVEX GR32:$dst, i32mem:$dstsrc2, GR32:$src3, CC), 0>;
+ def : InstAlias<"cmp"#Cond#"xadd"#"\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
+ (CMPCCXADDmr64_EVEX GR64:$dst, i64mem:$dstsrc2, GR64:$src3, CC), 0>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 457833f8cc33..c77c77ee4a3e 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1515,6 +1515,23 @@ def : Pat<(X86add_flag_nocf GR32:$src1, 128),
def : Pat<(X86add_flag_nocf GR64:$src1, 128),
(SUB64ri32 GR64:$src1, -128)>;
+// Depositing a value into an 8/16-bit subreg:
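+// For example, for i32, "(x & ~0xff) | zext(load i8)" becomes an
+// INSERT_SUBREG of a MOV8rm into the low 8 bits of $dst.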
+def : Pat<(or (and GR64:$dst, -256),
+ (i64 (zextloadi8 addr:$src))),
+ (INSERT_SUBREG (i64 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>;
+
+def : Pat<(or (and GR32:$dst, -256),
+ (i32 (zextloadi8 addr:$src))),
+ (INSERT_SUBREG (i32 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>;
+
+def : Pat<(or (and GR64:$dst, -65536),
+ (i64 (zextloadi16 addr:$src))),
+ (INSERT_SUBREG (i64 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>;
+
+def : Pat<(or (and GR32:$dst, -65536),
+ (i32 (zextloadi16 addr:$src))),
+ (INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>;
+
// The same trick applies for 32-bit immediate fields in 64-bit
// instructions.
def : Pat<(add GR64:$src1, 0x0000000080000000),
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index 09655d939121..6a9a74ce15f2 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -666,20 +666,20 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
let Uses = [FPSW, FPCW] in {
def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, PS,
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB,
Requires<[HasFXSR]>;
def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
"fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
- PS, Requires<[HasFXSR, In64BitMode]>;
+ TB, Requires<[HasFXSR, In64BitMode]>;
} // Uses = [FPSW, FPCW]
let Defs = [FPSW, FPCW] in {
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
- PS, Requires<[HasFXSR]>;
+ TB, Requires<[HasFXSR]>;
def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
- PS, Requires<[HasFXSR, In64BitMode]>;
+ TB, Requires<[HasFXSR, In64BitMode]>;
} // Defs = [FPSW, FPCW]
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index df05a5788a50..6e76b44b66a3 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -180,7 +180,7 @@ class OperandSize<bits<2> val> {
bits<2> Value = val;
}
def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
-def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32/64-bit mode.
def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
// Address size for encodings that change based on mode.
@@ -234,7 +234,9 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// based on address size of the mode?
bits<2> AdSizeBits = AdSize.Value;
- Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
+ Encoding OpEnc = EncNormal; // Encoding used by this instruction
+ // Which prefix byte does this inst have?
+ Prefix OpPrefix = !if(!eq(OpEnc, EncNormal), NoPrfx, PS);
bits<3> OpPrefixBits = OpPrefix.Value;
Map OpMap = OB; // Which opcode map does this inst have?
bits<4> OpMapBits = OpMap.Value;
@@ -243,7 +245,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
Domain ExeDomain = d;
bit hasREPPrefix = 0; // Does this inst have a REP prefix?
- Encoding OpEnc = EncNormal; // Encoding used by this instruction
bits<2> OpEncBits = OpEnc.Value;
bit IgnoresW = 0; // Does this inst ignore REX_W field?
bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX
@@ -255,6 +256,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field?
bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field?
bit hasEVEX_B = 0; // Does this inst set the EVEX_B field?
+ bit hasEVEX_NF = 0; // Does this inst set the EVEX_NF field?
bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width.
// Declare it int rather than bits<4> so that all bits are defined when
// assigning to bits<7>.
@@ -308,4 +310,5 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{48} = hasEVEX_RC;
let TSFlags{49} = hasNoTrackPrefix;
let TSFlags{51-50} = explicitOpPrefixBits;
+ let TSFlags{52} = hasEVEX_NF;
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index bc2d5ed1e17d..bddda6891356 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -8263,8 +8263,8 @@ bool X86InstrInfo::unfoldMemoryOperand(
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
- for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
- MIB.add(AddrOps[i]);
+ for (const MachineOperand &AddrOp : AddrOps)
+ MIB.add(AddrOp);
MIB.setMemRefs(MMOs);
NewMIs.push_back(MIB);
@@ -8341,8 +8341,8 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
- for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
- MIB.add(AddrOps[i]);
+ for (const MachineOperand &AddrOp : AddrOps)
+ MIB.add(AddrOp);
MIB.addReg(Reg, RegState::Kill);
MIB.setMemRefs(MMOs);
NewMIs.push_back(MIB);
diff --git a/llvm/lib/Target/X86/X86InstrKL.td b/llvm/lib/Target/X86/X86InstrKL.td
index a3392b691c0a..4586fc541627 100644
--- a/llvm/lib/Target/X86/X86InstrKL.td
+++ b/llvm/lib/Target/X86/X86InstrKL.td
@@ -19,17 +19,17 @@ let SchedRW = [WriteSystem], Predicates = [HasKL] in {
let Uses = [XMM0, EAX], Defs = [EFLAGS] in {
def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"loadiwkey\t{$src2, $src1|$src1, $src2}",
- [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS;
+ [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8, XS;
}
let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in {
def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS;
+ "encodekey128\t{$src, $dst|$dst, $src}", []>, T8, XS;
}
let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in {
def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS;
+ "encodekey256\t{$src, $dst|$dst, $src}", []>, T8, XS;
}
let Constraints = "$src1 = $dst",
@@ -37,22 +37,22 @@ let SchedRW = [WriteSystem], Predicates = [HasKL] in {
def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
"aesenc128kl\t{$src2, $src1|$src1, $src2}",
[(set VR128:$dst, EFLAGS,
- (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS;
+ (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8, XS;
def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
"aesdec128kl\t{$src2, $src1|$src1, $src2}",
[(set VR128:$dst, EFLAGS,
- (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS;
+ (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8, XS;
def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
"aesenc256kl\t{$src2, $src1|$src1, $src2}",
[(set VR128:$dst, EFLAGS,
- (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS;
+ (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8, XS;
def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
"aesdec256kl\t{$src2, $src1|$src1, $src2}",
[(set VR128:$dst, EFLAGS,
- (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS;
+ (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8, XS;
}
} // SchedRW, Predicates
@@ -62,13 +62,13 @@ let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in {
Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
mayLoad = 1 in {
def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src),
- "aesencwide128kl\t$src", []>, T8XS;
+ "aesencwide128kl\t$src", []>, T8, XS;
def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src),
- "aesdecwide128kl\t$src", []>, T8XS;
+ "aesdecwide128kl\t$src", []>, T8, XS;
def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src),
- "aesencwide256kl\t$src", []>, T8XS;
+ "aesencwide256kl\t$src", []>, T8, XS;
def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src),
- "aesdecwide256kl\t$src", []>, T8XS;
+ "aesdecwide256kl\t$src", []>, T8, XS;
}
} // SchedRW, Predicates
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 9796379aa0bf..8d6bc8d0ee2c 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -487,24 +487,24 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC;
+ WriteCvtPS2I, SSEPackedSingle>, TB, SIMD_EXC;
defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC;
+ WriteCvtPD2I, SSEPackedDouble>, TB, PD, SIMD_EXC;
defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC;
+ WriteCvtPS2I, SSEPackedSingle>, TB, SIMD_EXC;
defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC;
+ WriteCvtPD2I, SSEPackedDouble>, TB, PD, SIMD_EXC;
defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
- WriteCvtI2PD, SSEPackedDouble>, PD;
+ WriteCvtI2PD, SSEPackedDouble>, TB, PD;
let Constraints = "$src1 = $dst" in {
defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
int_x86_sse_cvtpi2ps,
i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, PS, SIMD_EXC;
+ SSEPackedSingle>, TB, SIMD_EXC;
}
// Extract / Insert
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 2ea10e317e12..305bd74f7bd7 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -165,10 +165,10 @@ def POPP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "popp\t$reg", []>,
REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>;
def POP2: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins),
"pop2\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4;
def POP2P: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins),
"pop2p\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS, REX_W;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4, REX_W;
} // mayLoad, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in
@@ -186,10 +186,10 @@ def PUSHP64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "pushp\t$reg", []>,
REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>;
def PUSH2: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2),
"push2\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4;
def PUSH2P: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2),
"push2p\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS, REX_W;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4, REX_W;
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in {
def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>,
@@ -251,52 +251,52 @@ let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
- PS, OpSize16, Sched<[WriteBSF]>;
+ TB, OpSize16, Sched<[WriteBSF]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
- PS, OpSize16, Sched<[WriteBSFLd]>;
+ TB, OpSize16, Sched<[WriteBSFLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
- PS, OpSize32, Sched<[WriteBSF]>;
+ TB, OpSize32, Sched<[WriteBSF]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
- PS, OpSize32, Sched<[WriteBSFLd]>;
+ TB, OpSize32, Sched<[WriteBSFLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
- PS, Sched<[WriteBSF]>;
+ TB, Sched<[WriteBSF]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
- PS, Sched<[WriteBSFLd]>;
+ TB, Sched<[WriteBSFLd]>;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
- PS, OpSize16, Sched<[WriteBSR]>;
+ TB, OpSize16, Sched<[WriteBSR]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
- PS, OpSize16, Sched<[WriteBSRLd]>;
+ TB, OpSize16, Sched<[WriteBSRLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
- PS, OpSize32, Sched<[WriteBSR]>;
+ TB, OpSize32, Sched<[WriteBSR]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
- PS, OpSize32, Sched<[WriteBSRLd]>;
+ TB, OpSize32, Sched<[WriteBSRLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
- PS, Sched<[WriteBSR]>;
+ TB, Sched<[WriteBSR]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
- PS, Sched<[WriteBSRLd]>;
+ TB, Sched<[WriteBSRLd]>;
} // Defs = [EFLAGS]
let SchedRW = [WriteMicrocoded] in {
@@ -1095,29 +1095,29 @@ let Predicates = [HasMOVBE] in {
def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (bswap (loadi16 addr:$src)))]>,
- OpSize16, T8PS;
+ OpSize16, T8;
def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bswap (loadi32 addr:$src)))]>,
- OpSize32, T8PS;
+ OpSize32, T8;
def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bswap (loadi64 addr:$src)))]>,
- T8PS;
+ T8;
}
let SchedRW = [WriteStore] in {
def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"movbe{w}\t{$src, $dst|$dst, $src}",
[(store (bswap GR16:$src), addr:$dst)]>,
- OpSize16, T8PS;
+ OpSize16, T8;
def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movbe{l}\t{$src, $dst|$dst, $src}",
[(store (bswap GR32:$src), addr:$dst)]>,
- OpSize32, T8PS;
+ OpSize32, T8;
def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movbe{q}\t{$src, $dst|$dst, $src}",
[(store (bswap GR64:$src), addr:$dst)]>,
- T8PS;
+ T8;
}
}
@@ -1127,13 +1127,13 @@ let Predicates = [HasMOVBE] in {
let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
"rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>,
- OpSize16, PS;
+ OpSize16, TB;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
"rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>,
- OpSize32, PS;
+ OpSize32, TB;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
"rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>,
- PS;
+ TB;
}
//===----------------------------------------------------------------------===//
@@ -1141,11 +1141,11 @@ let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
//
let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS;
+ [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB;
def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS;
+ [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB;
def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS;
+ [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
}
//===----------------------------------------------------------------------===//
@@ -1155,29 +1155,29 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>,
- XS, OpSize16, Sched<[WriteLZCNT]>;
+ TB, XS, OpSize16, Sched<[WriteLZCNT]>;
def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz (loadi16 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>;
+ (implicit EFLAGS)]>, TB, XS, OpSize16, Sched<[WriteLZCNTLd]>;
def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>,
- XS, OpSize32, Sched<[WriteLZCNT]>;
+ TB, XS, OpSize32, Sched<[WriteLZCNT]>;
def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz (loadi32 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>;
+ (implicit EFLAGS)]>, TB, XS, OpSize32, Sched<[WriteLZCNTLd]>;
def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
- XS, Sched<[WriteLZCNT]>;
+ TB, XS, Sched<[WriteLZCNT]>;
def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctlz (loadi64 addr:$src))),
- (implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>;
+ (implicit EFLAGS)]>, TB, XS, Sched<[WriteLZCNTLd]>;
}
//===----------------------------------------------------------------------===//
@@ -1187,29 +1187,29 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>,
- XS, OpSize16, Sched<[WriteTZCNT]>;
+ TB, XS, OpSize16, Sched<[WriteTZCNT]>;
def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz (loadi16 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>;
+ (implicit EFLAGS)]>, TB, XS, OpSize16, Sched<[WriteTZCNTLd]>;
def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>,
- XS, OpSize32, Sched<[WriteTZCNT]>;
+ TB, XS, OpSize32, Sched<[WriteTZCNT]>;
def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz (loadi32 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>;
+ (implicit EFLAGS)]>, TB, XS, OpSize32, Sched<[WriteTZCNTLd]>;
def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
- XS, Sched<[WriteTZCNT]>;
+ TB, XS, Sched<[WriteTZCNT]>;
def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (cttz (loadi64 addr:$src))),
- (implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>;
+ (implicit EFLAGS)]>, TB, XS, Sched<[WriteTZCNTLd]>;
}
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
@@ -1218,11 +1218,11 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
let hasSideEffects = 0 in {
def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[sched]>;
+ T8, VEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in
def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[sched.Folded]>;
+ T8, VEX, VVVV, Sched<[sched.Folded]>;
}
}
@@ -1288,12 +1288,12 @@ multiclass bmi4VOp3_base<bits<8> opc, string mnemonic, RegisterClass RC,
def rr#Suffix : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
- T8PS, VEX, Sched<[Sched]>;
+ T8, VEX, Sched<[Sched]>;
let mayLoad = 1 in
def rm#Suffix : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)),
- (implicit EFLAGS)]>, T8PS, VEX,
+ (implicit EFLAGS)]>, T8, VEX,
Sched<[Sched.Folded,
// x86memop:$src1
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -1371,33 +1371,33 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
- VEX_4V, Sched<[WriteALU]>;
+ VEX, VVVV, Sched<[WriteALU]>;
def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
- VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+ VEX, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
}
let Predicates = [HasBMI2, NoEGPR] in {
defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
- X86pdep, loadi32>, T8XD;
+ X86pdep, loadi32>, T8, XD;
defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
- X86pdep, loadi64>, T8XD, REX_W;
+ X86pdep, loadi64>, T8, XD, REX_W;
defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
- X86pext, loadi32>, T8XS;
+ X86pext, loadi32>, T8, XS;
defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
- X86pext, loadi64>, T8XS, REX_W;
+ X86pext, loadi64>, T8, XS, REX_W;
}
let Predicates = [HasBMI2, HasEGPR] in {
defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
- X86pdep, loadi32, "_EVEX">, T8XD, EVEX;
+ X86pdep, loadi32, "_EVEX">, T8, XD, EVEX;
defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
- X86pdep, loadi64, "_EVEX">, T8XD, REX_W, EVEX;
+ X86pdep, loadi64, "_EVEX">, T8, XD, REX_W, EVEX;
defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
- X86pext, loadi32, "_EVEX">, T8XS, EVEX;
+ X86pext, loadi32, "_EVEX">, T8, XS, EVEX;
defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
- X86pext, loadi64, "_EVEX">, T8XS, REX_W, EVEX;
+ X86pext, loadi64, "_EVEX">, T8, XS, REX_W, EVEX;
}
//===----------------------------------------------------------------------===//
@@ -1419,12 +1419,12 @@ multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>,
- XOP_4V, XOPA;
+ XOP, VVVV, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>,
- XOP_4V, XOPA;
+ XOP, VVVV, XOPA;
}
let Defs = [EFLAGS] in {
@@ -1435,12 +1435,12 @@ let Defs = [EFLAGS] in {
multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA;
+ [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP, VVVV, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>,
- XOP_4V, XOPA;
+ XOP, VVVV, XOPA;
}
defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>;
@@ -1471,22 +1471,22 @@ let SchedRW = [ WriteSystem ] in {
let SchedRW = [WriteSystem] in {
def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src),
"umonitor\t$src", [(int_x86_umonitor GR16:$src)]>,
- XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>;
+ TB, XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>;
def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src),
"umonitor\t$src", [(int_x86_umonitor GR32:$src)]>,
- XS, AdSize32, Requires<[HasWAITPKG]>;
+ TB, XS, AdSize32, Requires<[HasWAITPKG]>;
def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src),
"umonitor\t$src", [(int_x86_umonitor GR64:$src)]>,
- XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>;
+ TB, XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>;
let Uses = [EAX, EDX], Defs = [EFLAGS] in {
def UMWAIT : I<0xAE, MRM6r,
(outs), (ins GR32orGR64:$src), "umwait\t$src",
[(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>,
- XD, Requires<[HasWAITPKG]>;
+ TB, XD, Requires<[HasWAITPKG]>;
def TPAUSE : I<0xAE, MRM6r,
(outs), (ins GR32orGR64:$src), "tpause\t$src",
[(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>,
- PD, Requires<[HasWAITPKG]>;
+ TB, PD, Requires<[HasWAITPKG]>;
}
} // SchedRW
@@ -1497,19 +1497,19 @@ let SchedRW = [WriteStore] in {
def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore32 addr:$dst, GR32:$src)]>,
- T8PS, Requires<[HasMOVDIRI, NoEGPR]>;
+ T8, Requires<[HasMOVDIRI, NoEGPR]>;
def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore64 addr:$dst, GR64:$src)]>,
- T8PS, Requires<[In64BitMode, HasMOVDIRI, NoEGPR]>;
+ T8, Requires<[In64BitMode, HasMOVDIRI, NoEGPR]>;
def MOVDIRI32_EVEX : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore32 addr:$dst, GR32:$src)]>,
- EVEX_NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>;
+ EVEX, NoCD8, T_MAP4, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>;
def MOVDIRI64_EVEX : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore64 addr:$dst, GR64:$src)]>,
- EVEX_NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>;
+ EVEX, NoCD8, T_MAP4, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -1518,23 +1518,23 @@ def MOVDIRI64_EVEX : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
let SchedRW = [WriteStore] in {
def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem_GR16:$src),
"movdir64b\t{$src, $dst|$dst, $src}", []>,
- T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>;
+ T8, PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>;
def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem_GR32:$src),
"movdir64b\t{$src, $dst|$dst, $src}",
[(int_x86_movdir64b GR32:$dst, addr:$src)]>,
- T8PD, AdSize32, Requires<[HasMOVDIR64B, NoEGPR]>;
+ T8, PD, AdSize32, Requires<[HasMOVDIR64B, NoEGPR]>;
def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem_GR64:$src),
"movdir64b\t{$src, $dst|$dst, $src}",
[(int_x86_movdir64b GR64:$dst, addr:$src)]>,
- T8PD, AdSize64, Requires<[HasMOVDIR64B, NoEGPR, In64BitMode]>;
+ T8, PD, AdSize64, Requires<[HasMOVDIR64B, NoEGPR, In64BitMode]>;
def MOVDIR64B32_EVEX : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem_GR32:$src),
"movdir64b\t{$src, $dst|$dst, $src}",
[(int_x86_movdir64b GR32:$dst, addr:$src)]>,
- EVEX_NoCD8, T_MAP4PD, AdSize32, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>;
+ EVEX, NoCD8, T_MAP4, PD, AdSize32, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>;
def MOVDIR64B64_EVEX : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem_GR64:$src),
"movdir64b\t{$src, $dst|$dst, $src}",
[(int_x86_movdir64b GR64:$dst, addr:$src)]>,
- EVEX_NoCD8, T_MAP4PD, AdSize64, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>;
+ EVEX, NoCD8, T_MAP4, PD, AdSize64, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -1544,28 +1544,28 @@ let SchedRW = [WriteStore], Defs = [EFLAGS] in {
def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
"enqcmd\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>,
- T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ T8, XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
"enqcmd\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>,
- T8XD, AdSize32, Requires<[HasENQCMD]>;
+ T8, XD, AdSize32, Requires<[HasENQCMD]>;
def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
"enqcmd\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>,
- T8XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+ T8, XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src),
"enqcmds\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>,
- T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
+ T8, XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>;
def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src),
"enqcmds\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmds GR32:$dst, addr:$src))]>,
- T8XS, AdSize32, Requires<[HasENQCMD]>;
+ T8, XS, AdSize32, Requires<[HasENQCMD]>;
def ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src),
"enqcmds\t{$src, $dst|$dst, $src}",
[(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>,
- T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
+ T8, XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>;
}
//===----------------------------------------------------------------------===//
@@ -1588,11 +1588,11 @@ let SchedRW = [WriteSystem] in {
let Uses = [EAX, EDX] in
def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins),
"invlpgb", []>,
- PS, Requires<[Not64BitMode]>;
+ TB, Requires<[Not64BitMode]>;
let Uses = [RAX, EDX] in
def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins),
"invlpgb", []>,
- PS, Requires<[In64BitMode]>;
+ TB, Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -1602,7 +1602,7 @@ let SchedRW = [WriteSystem] in {
let SchedRW = [WriteSystem] in {
def TLBSYNC : I<0x01, MRM_FF, (outs), (ins),
"tlbsync", []>,
- PS, Requires<[]>;
+ TB, Requires<[]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -1610,14 +1610,14 @@ let SchedRW = [WriteSystem] in {
//
let Uses = [EAX], SchedRW = [WriteSystem] in
def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>,
- Requires<[HasHRESET]>, TAXS;
+ Requires<[HasHRESET]>, TA, XS;
//===----------------------------------------------------------------------===//
// SERIALIZE Instruction
//
let SchedRW = [WriteSystem] in
def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
- [(int_x86_serialize)]>, PS,
+ [(int_x86_serialize)]>, TB,
Requires<[HasSERIALIZE]>;
//===----------------------------------------------------------------------===//
@@ -1625,9 +1625,9 @@ let SchedRW = [WriteSystem] in
//
let Predicates = [HasTSXLDTRK], SchedRW = [WriteSystem] in {
def XSUSLDTRK : I<0x01, MRM_E8, (outs), (ins), "xsusldtrk",
- [(int_x86_xsusldtrk)]>, XD;
+ [(int_x86_xsusldtrk)]>, TB, XD;
def XRESLDTRK : I<0x01, MRM_E9, (outs), (ins), "xresldtrk",
- [(int_x86_xresldtrk)]>, XD;
+ [(int_x86_xresldtrk)]>, TB, XD;
}
//===----------------------------------------------------------------------===//
@@ -1635,18 +1635,18 @@ let Predicates = [HasTSXLDTRK], SchedRW = [WriteSystem] in {
//
let Predicates = [HasUINTR, In64BitMode], SchedRW = [WriteSystem] in {
def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret",
- []>, XS;
+ []>, TB, XS;
def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui",
- [(int_x86_clui)]>, XS;
+ [(int_x86_clui)]>, TB, XS;
def STUI : I<0x01, MRM_EF, (outs), (ins), "stui",
- [(int_x86_stui)]>, XS;
+ [(int_x86_stui)]>, TB, XS;
def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg",
- [(int_x86_senduipi GR64:$arg)]>, XS;
+ [(int_x86_senduipi GR64:$arg)]>, TB, XS;
let Defs = [EFLAGS] in
def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui",
- [(set EFLAGS, (X86testui))]>, XS;
+ [(set EFLAGS, (X86testui))]>, TB, XS;
}
//===----------------------------------------------------------------------===//
@@ -1663,21 +1663,38 @@ let Predicates = [HasPREFETCHI, In64BitMode], SchedRW = [WriteLoad] in {
// CMPCCXADD Instructions
//
let isCodeGenOnly = 1, ForceDisassemble = 1, mayLoad = 1, mayStore = 1,
- Predicates = [HasCMPCCXADD, In64BitMode], Defs = [EFLAGS],
- Constraints = "$dstsrc1 = $dst" in {
+ Defs = [EFLAGS], Constraints = "$dstsrc1 = $dst" in {
+let Predicates = [HasCMPCCXADD, NoEGPR, In64BitMode] in {
def CMPCCXADDmr32 : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst),
(ins GR32:$dstsrc1, i32mem:$dstsrc2, GR32:$src3, ccode:$cond),
"cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
[(set GR32:$dst, (X86cmpccxadd addr:$dstsrc2,
GR32:$dstsrc1, GR32:$src3, timm:$cond))]>,
- VEX_4V, T8PD, Sched<[WriteXCHG]>;
+ VEX, VVVV, T8, PD, Sched<[WriteXCHG]>;
def CMPCCXADDmr64 : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst),
(ins GR64:$dstsrc1, i64mem:$dstsrc2, GR64:$src3, ccode:$cond),
"cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
[(set GR64:$dst, (X86cmpccxadd addr:$dstsrc2,
GR64:$dstsrc1, GR64:$src3, timm:$cond))]>,
- VEX_4V, REX_W, T8PD, Sched<[WriteXCHG]>;
+ VEX, VVVV, REX_W, T8, PD, Sched<[WriteXCHG]>;
+}
+
+let Predicates = [HasCMPCCXADD, HasEGPR, In64BitMode] in {
+def CMPCCXADDmr32_EVEX : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst),
+ (ins GR32:$dstsrc1, i32mem:$dstsrc2, GR32:$src3, ccode:$cond),
+ "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
+ [(set GR32:$dst, (X86cmpccxadd addr:$dstsrc2,
+ GR32:$dstsrc1, GR32:$src3, timm:$cond))]>,
+ EVEX, VVVV, NoCD8, T8, PD, Sched<[WriteXCHG]>;
+
+def CMPCCXADDmr64_EVEX : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst),
+ (ins GR64:$dstsrc1, i64mem:$dstsrc2, GR64:$src3, ccode:$cond),
+ "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
+ [(set GR64:$dst, (X86cmpccxadd addr:$dstsrc2,
+ GR64:$dstsrc1, GR64:$src3, timm:$cond))]>,
+ EVEX, VVVV, NoCD8, REX_W, T8, PD, Sched<[WriteXCHG]>;
+}
}
//===----------------------------------------------------------------------===//
@@ -1686,12 +1703,12 @@ def CMPCCXADDmr64 : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst),
let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
+ "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, TB, PD;
let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
- [(int_x86_clwb addr:$src)]>, PD;
+ [(int_x86_clwb addr:$src)]>, TB, PD;
let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
- [(int_x86_cldemote addr:$src)]>, PS;
+ [(int_x86_cldemote addr:$src)]>, TB;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 8653f15d8602..94fa6e45ded9 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -10,6 +10,8 @@ def TruePredicate : Predicate<"true">;
def HasEGPR : Predicate<"Subtarget->hasEGPR()">;
def NoEGPR : Predicate<"!Subtarget->hasEGPR()">;
+def HasNDD : Predicate<"Subtarget->hasNDD()">;
+def NoNDD : Predicate<"!Subtarget->hasNDD()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
@@ -100,7 +102,6 @@ def HasIFMA : Predicate<"Subtarget->hasIFMA()">;
def HasAVXIFMA : Predicate<"Subtarget->hasAVXIFMA()">;
def NoVLX_Or_NoIFMA : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasIFMA()">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
-def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasSHA512 : Predicate<"Subtarget->hasSHA512()">;
def HasSGX : Predicate<"Subtarget->hasSGX()">;
diff --git a/llvm/lib/Target/X86/X86InstrRAOINT.td b/llvm/lib/Target/X86/X86InstrRAOINT.td
index dc0e267a83e3..bc17b00f3573 100644
--- a/llvm/lib/Target/X86/X86InstrRAOINT.td
+++ b/llvm/lib/Target/X86/X86InstrRAOINT.td
@@ -39,7 +39,7 @@ multiclass RAOINT_BASE<string OpcodeStr> {
Sched<[WriteALURMW]>, REX_W;
}
-defm AADD : RAOINT_BASE<"add">, T8PS;
-defm AAND : RAOINT_BASE<"and">, T8PD;
-defm AOR : RAOINT_BASE<"or" >, T8XD;
-defm AXOR : RAOINT_BASE<"xor">, T8XS;
+defm AADD : RAOINT_BASE<"add">, T8;
+defm AAND : RAOINT_BASE<"and">, T8, PD;
+defm AOR : RAOINT_BASE<"or" >, T8, XD;
+defm AXOR : RAOINT_BASE<"xor">, T8, XS;
diff --git a/llvm/lib/Target/X86/X86InstrSGX.td b/llvm/lib/Target/X86/X86InstrSGX.td
index 6439f717accb..747f5aa86653 100644
--- a/llvm/lib/Target/X86/X86InstrSGX.td
+++ b/llvm/lib/Target/X86/X86InstrSGX.td
@@ -17,13 +17,13 @@
let SchedRW = [WriteSystem], Predicates = [HasSGX] in {
// ENCLS - Execute an Enclave System Function of Specified Leaf Number
def ENCLS : I<0x01, MRM_CF, (outs), (ins),
- "encls", []>, PS;
+ "encls", []>, TB;
// ENCLU - Execute an Enclave User Function of Specified Leaf Number
def ENCLU : I<0x01, MRM_D7, (outs), (ins),
- "enclu", []>, PS;
+ "enclu", []>, TB;
// ENCLV - Execute an Enclave VMM Function of Specified Leaf Number
def ENCLV : I<0x01, MRM_C0, (outs), (ins),
- "enclv", []>, PS;
+ "enclv", []>, TB;
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrSNP.td b/llvm/lib/Target/X86/X86InstrSNP.td
index ab13fa43c92d..05ed6585db6d 100644
--- a/llvm/lib/Target/X86/X86InstrSNP.td
+++ b/llvm/lib/Target/X86/X86InstrSNP.td
@@ -17,31 +17,31 @@
let SchedRW = [WriteSystem] in {
// F3 0F 01 FF
let Uses = [RAX], Defs = [EAX, EFLAGS] in
-def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS,
+def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, TB, XS,
Requires<[In64BitMode]>;
// F2 0F 01 FF
let Uses = [RAX, RCX, RDX], Defs = [EAX, EFLAGS] in
def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
- XD, Requires<[In64BitMode]>;
+ TB, XD, Requires<[In64BitMode]>;
let Uses = [EAX, ECX, EDX], Defs = [EAX, EFLAGS] in
def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
- XD, Requires<[Not64BitMode]>;
+ TB, XD, Requires<[Not64BitMode]>;
// F2 0F 01 FE
let Uses = [RAX, RCX], Defs = [EAX, EFLAGS] in
-def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD,
+def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, TB, XD,
Requires<[In64BitMode]>;
// F3 0F 01 FE
let Uses = [RAX, RCX, RDX], Defs = [EAX, EFLAGS] in
-def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS,
+def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, TB, XS,
Requires<[In64BitMode]>;
// F3 0F 01 FD
let Uses = [RAX, RDX], Defs = [RAX, RCX, RDX, EFLAGS] in
-def RMPQUERY: I<0x01, MRM_FD, (outs), (ins), "rmpquery", []>, XS,
+def RMPQUERY: I<0x01, MRM_FD, (outs), (ins), "rmpquery", []>, TB, XS,
Requires<[In64BitMode]>;
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index be6962ebbb4f..e8a1a2b83886 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -215,7 +215,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
let Predicates = [UseAVX, OptForSize] in
defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
- VEX_4V, VEX_LIG, WIG;
+ VEX, VVVV, VEX_LIG, WIG;
def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -268,15 +268,15 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
- SSEPackedSingle, UseSSE1>, XS;
+ SSEPackedSingle, UseSSE1>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
- SSEPackedDouble, UseSSE2>, XD;
+ SSEPackedDouble, UseSSE2>, TB, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
- SSEPackedSingle>, XS;
+ SSEPackedSingle>, TB, XS;
defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
- SSEPackedDouble>, XD;
+ SSEPackedDouble>, TB, XD;
}
// Patterns
@@ -352,46 +352,46 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
SSEPackedSingle, SchedWriteFMoveLS.XMM>,
- PS, VEX, WIG;
+ TB, VEX, WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
SSEPackedDouble, SchedWriteFMoveLS.XMM>,
- PD, VEX, WIG;
+ TB, PD, VEX, WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
SSEPackedSingle, SchedWriteFMoveLS.XMM>,
- PS, VEX, WIG;
+ TB, VEX, WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
SSEPackedDouble, SchedWriteFMoveLS.XMM>,
- PD, VEX, WIG;
+ TB, PD, VEX, WIG;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
SSEPackedSingle, SchedWriteFMoveLS.YMM>,
- PS, VEX, VEX_L, WIG;
+ TB, VEX, VEX_L, WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
SSEPackedDouble, SchedWriteFMoveLS.YMM>,
- PD, VEX, VEX_L, WIG;
+ TB, PD, VEX, VEX_L, WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
SSEPackedSingle, SchedWriteFMoveLS.YMM>,
- PS, VEX, VEX_L, WIG;
+ TB, VEX, VEX_L, WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
SSEPackedDouble, SchedWriteFMoveLS.YMM>,
- PD, VEX, VEX_L, WIG;
+ TB, PD, VEX, VEX_L, WIG;
}
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
SSEPackedSingle, SchedWriteFMoveLS.XMM>,
- PS;
+ TB;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
SSEPackedSingle, SchedWriteFMoveLS.XMM>,
- PS;
+ TB;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
SSEPackedDouble, SchedWriteFMoveLS.XMM>,
- PD;
+ TB, PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
SSEPackedDouble, SchedWriteFMoveLS.XMM>,
- PD;
+ TB, PD;
}
let Predicates = [HasAVX, NoVLX] in {
@@ -666,7 +666,7 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
def PSrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "s", asm_opr),
- [], SSEPackedSingle>, PS,
+ [], SSEPackedSingle>, TB,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
def PDrm : PI<opc, MRMSrcMem,
@@ -674,7 +674,7 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
!strconcat(base_opc, "d", asm_opr),
[(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
- SSEPackedDouble>, PD,
+ SSEPackedDouble>, TB, PD,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
@@ -683,7 +683,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
let Predicates = [UseAVX] in
defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Constraints = "$src1 = $dst" in
defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
@@ -823,14 +823,14 @@ let Predicates = [UseAVX] in {
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
- VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
+ VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
let isCommutable = 1 in
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
- VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
+ VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
}
let Constraints = "$src1 = $dst" in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
@@ -903,36 +903,36 @@ let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPExceptio
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>,
- XS, VEX, VEX_LIG;
+ TB, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
WriteCvtSS2I, SSEPackedSingle>,
- XS, VEX, REX_W, VEX_LIG;
+ TB, XS, VEX, REX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>,
- XD, VEX, VEX_LIG;
+ TB, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>,
- XD, VEX, REX_W, VEX_LIG;
+ TB, XD, VEX, REX_W, VEX_LIG;
defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>,
- XS, VEX, VEX_LIG;
+ TB, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
WriteCvtSS2I, SSEPackedSingle>,
- XS, VEX, REX_W, VEX_LIG;
+ TB, XS, VEX, REX_W, VEX_LIG;
defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>,
- XD, VEX, VEX_LIG;
+ TB, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
WriteCvtSD2I, SSEPackedDouble>,
- XD, VEX, REX_W, VEX_LIG;
+ TB, XD, VEX, REX_W, VEX_LIG;
}
// The assembler can recognize rr 64-bit instructions by seeing a rxx
@@ -941,16 +941,16 @@ defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
// where appropriate to do so.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
- WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV,
VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
- WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV,
REX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
- WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV,
VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
- WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV,
REX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1
@@ -983,42 +983,42 @@ let Predicates = [UseAVX] in {
let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
- WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
+ WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
- WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+ WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC;
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
"cvtss2si", "cvtss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
- WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
+ WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
"cvtsd2si", "cvtsd2si",
- WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+ WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
"cvtsi2ss", "cvtsi2ss{l}",
- WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
+ WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
"cvtsi2ss", "cvtsi2ss{q}",
- WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
+ WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
"cvtsi2sd", "cvtsi2sd{l}",
- WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
+ WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
"cvtsi2sd", "cvtsi2sd{q}",
- WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
+ WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
let Predicates = [UseSSE1] in {
@@ -1074,46 +1074,46 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
- WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
+ WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
- WriteCvtSD2I, SSEPackedDouble>, XD, VEX, REX_W, VEX_LIG;
+ WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, REX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
- SSEPackedDouble>, XD;
+ SSEPackedDouble>, TB, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
- SSEPackedDouble>, XD, REX_W;
+ SSEPackedDouble>, TB, XD, REX_W;
}
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
- XS, VEX_4V, VEX_LIG, SIMD_EXC;
+ TB, XS, VEX, VVVV, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
- XS, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
+ TB, XS, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
- XD, VEX_4V, VEX_LIG;
+ TB, XD, VEX, VVVV, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
- XD, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
+ TB, XD, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
- XS, SIMD_EXC;
+ TB, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
- XS, REX_W, SIMD_EXC;
+ TB, XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
- XD;
+ TB, XD;
defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
- XD, REX_W, SIMD_EXC;
+ TB, XD, REX_W, SIMD_EXC;
}
def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1150,34 +1150,34 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
"cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
- XS, VEX, VEX_LIG, REX_W;
+ TB, XS, VEX, VEX_LIG, REX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
- WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
+ WriteCvtSS2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
- XD, VEX, VEX_LIG, REX_W;
+ TB, XD, VEX, VEX_LIG, REX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
"cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
- XS, REX_W;
+ TB, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
- WriteCvtSD2I, SSEPackedDouble>, XD;
+ WriteCvtSD2I, SSEPackedDouble>, TB, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
"cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
- XD, REX_W;
+ TB, XD, REX_W;
}
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1217,32 +1217,32 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, VEX, REX_W, VEX_LIG;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, REX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
+ WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, WriteCvtI2PS>,
- PS, VEX, Requires<[HasAVX, NoVLX]>, WIG;
+ TB, VEX, Requires<[HasAVX, NoVLX]>, WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, WriteCvtI2PSY>,
- PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG;
+ TB, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, WriteCvtI2PS>,
- PS, Requires<[UseSSE2]>;
+ TB, Requires<[UseSSE2]>;
}
// AVX aliases
@@ -1289,13 +1289,13 @@ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V, VEX_LIG, WIG,
+ VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- XD, VEX_4V, VEX_LIG, WIG,
+ TB, XD, VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}
@@ -1311,7 +1311,7 @@ def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
- XD, Requires<[UseSSE2, OptForSize]>,
+ TB, XD, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}
@@ -1321,14 +1321,14 @@ def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
- XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
+ TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
- XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
+ TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
@@ -1336,13 +1336,13 @@ def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
- XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
+ TB, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
- XD, Requires<[UseSSE2]>,
+ TB, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}
@@ -1353,13 +1353,13 @@ let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- XS, VEX_4V, VEX_LIG, WIG,
+ TB, XS, VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- XS, VEX_4V, VEX_LIG, WIG,
+ TB, XS, VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0
@@ -1373,11 +1373,11 @@ let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (any_fpextend FR32:$src))]>,
- XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
+ TB, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
- XS, Requires<[UseSSE2, OptForSize]>,
+ TB, XS, Requires<[UseSSE2, OptForSize]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
} // isCodeGenOnly = 1
@@ -1386,25 +1386,25 @@ let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_LIG, WIG,
+ []>, TB, XS, VEX, VVVV, VEX_LIG, WIG,
Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_LIG, WIG, Requires<[HasAVX]>,
+ []>, TB, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- []>, XS, Requires<[UseSSE2]>,
+ []>, TB, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
- []>, XS, Requires<[UseSSE2]>,
+ []>, TB, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0
@@ -1699,30 +1699,30 @@ let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
- PS, VEX, Sched<[WriteCvtPS2PD]>, WIG;
+ TB, VEX, Sched<[WriteCvtPS2PD]>, WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
- PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG;
+ TB, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
- PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG;
+ TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
- PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG;
+ TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG;
}
let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
- PS, Sched<[WriteCvtPS2PD]>;
+ TB, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
- PS, Sched<[WriteCvtPS2PD.Folded]>;
+ TB, Sched<[WriteCvtPS2PD.Folded]>;
}
// Convert Packed DW Integers to Packed Double FP
@@ -1860,22 +1860,22 @@ let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
- XS, VEX_4V, VEX_LIG, WIG;
+ TB, XS, VEX, VVVV, VEX_LIG, WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
- XD, VEX_4V, VEX_LIG, WIG;
+ TB, XD, VEX, VVVV, VEX_LIG, WIG;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
"cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, TB, XS;
let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, TB, XD;
}
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
@@ -1919,44 +1919,44 @@ let mayLoad = 1 in
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
+ "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
+ "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;
defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
- "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
+ "comiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
- "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
+ "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;
let isCodeGenOnly = 1 in {
defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
+ sse_load_f32, "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
+ sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;
defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
+ sse_load_f32, "comiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
+ sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, PS;
+ "ucomiss", SSEPackedSingle>, TB;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, PD;
+ "ucomisd", SSEPackedDouble>, TB, PD;
defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
- "comiss", SSEPackedSingle>, PS;
+ "comiss", SSEPackedSingle>, TB;
defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
- "comisd", SSEPackedDouble>, PD;
+ "comisd", SSEPackedDouble>, TB, PD;
let isCodeGenOnly = 1 in {
defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
+ sse_load_f32, "ucomiss", SSEPackedSingle>, TB;
defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
+ sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD;
defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", SSEPackedSingle>, PS;
+ sse_load_f32, "comiss", SSEPackedSingle>, TB;
defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", SSEPackedDouble>, PD;
+ sse_load_f64, "comisd", SSEPackedDouble>, TB, PD;
}
} // Defs = [EFLAGS]
@@ -1979,23 +1979,23 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, WIG;
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, TB, VEX, VVVV, WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, WIG;
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, TB, PD, VEX, VVVV, WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, WIG;
+ SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, TB, VEX, VVVV, VEX_L, WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, WIG;
+ SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, TB, PD, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, TB;
defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, TB, PD;
}
def CommutableCMPCC : PatLeaf<(timm), [{
@@ -2076,27 +2076,27 @@ let Predicates = [HasAVX, NoVLX] in {
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
- PS, VEX_4V, WIG;
+ TB, VEX, VVVV, WIG;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
- PS, VEX_4V, VEX_L, WIG;
+ TB, VEX, VVVV, VEX_L, WIG;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
- PD, VEX_4V, WIG;
+ TB, PD, VEX, VVVV, WIG;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
- PD, VEX_4V, VEX_L, WIG;
+ TB, PD, VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
+ memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, TB;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
+ memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD;
}
//===----------------------------------------------------------------------===//
@@ -2126,44 +2126,44 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD, VEX, VVVV, WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD, VEX, VVVV, WIG;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG;
}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, TB;
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD;
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, TB;
defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD;
} // Constraints = "$src1 = $dst"
let Predicates = [HasAVX1Only] in {
@@ -2208,13 +2208,13 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
let Predicates = [HasAVX] in {
defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
- SSEPackedSingle>, PS, VEX, WIG;
+ SSEPackedSingle>, TB, VEX, WIG;
defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
- SSEPackedDouble>, PD, VEX, WIG;
+ SSEPackedDouble>, TB, PD, VEX, WIG;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
- SSEPackedSingle>, PS, VEX, VEX_L, WIG;
+ SSEPackedSingle>, TB, VEX, VEX_L, WIG;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
- SSEPackedDouble>, PD, VEX, VEX_L, WIG;
+ SSEPackedDouble>, TB, PD, VEX, VEX_L, WIG;
// Also support integer VTs to avoid a int->fp bitcast in the DAG.
def : Pat<(X86movmsk (v4i32 VR128:$src)),
@@ -2228,9 +2228,9 @@ let Predicates = [HasAVX] in {
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
- SSEPackedSingle>, PS;
+ SSEPackedSingle>, TB;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
- SSEPackedDouble>, PD;
+ SSEPackedDouble>, TB, PD;
let Predicates = [UseSSE2] in {
// Also support integer VTs to avoid a int->fp bitcast in the DAG.
@@ -2276,7 +2276,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
VR128, load, i128mem, sched.XMM,
- IsCommutable, 0>, VEX_4V, WIG;
+ IsCommutable, 0>, VEX, VVVV, WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2285,7 +2285,7 @@ let Constraints = "$src1 = $dst" in
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
OpVT256, VR256, load, i256mem, sched.YMM,
- IsCommutable, 0>, VEX_4V, VEX_L, WIG;
+ IsCommutable, 0>, VEX, VVVV, VEX_L, WIG;
}
// These are ordered here for pattern ordering requirements with the fp versions
@@ -2312,29 +2312,29 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
- [], [], 0>, PS, VEX_4V, VEX_L, WIG;
+ [], [], 0>, TB, VEX, VVVV, VEX_L, WIG;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
- [], [], 0>, PD, VEX_4V, VEX_L, WIG;
+ [], [], 0>, TB, PD, VEX, VVVV, VEX_L, WIG;
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
- [], [], 0>, PS, VEX_4V, WIG;
+ [], [], 0>, TB, VEX, VVVV, WIG;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
- [], [], 0>, PD, VEX_4V, WIG;
+ [], [], 0>, TB, PD, VEX, VVVV, WIG;
}
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
- [], []>, PS;
+ [], []>, TB;
defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
- [], []>, PD;
+ [], []>, TB, PD;
}
}
@@ -2636,26 +2636,26 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
- SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, WIG;
+ SSEPackedSingle, sched.PS.XMM, 0>, TB, VEX, VVVV, WIG;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64,
- SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, WIG;
+ SSEPackedDouble, sched.PD.XMM, 0>, TB, PD, VEX, VVVV, WIG;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, VR256, v8f32, f256mem, loadv8f32,
- SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, WIG;
+ SSEPackedSingle, sched.PS.YMM, 0>, TB, VEX, VVVV, VEX_L, WIG;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, VR256, v4f64, f256mem, loadv4f64,
- SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, WIG;
+ SSEPackedDouble, sched.PD.YMM, 0>, TB, PD, VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
v4f32, f128mem, memopv4f32, SSEPackedSingle,
- sched.PS.XMM>, PS;
+ sched.PS.XMM>, TB;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
v2f64, f128mem, memopv2f64, SSEPackedDouble,
- sched.PD.XMM>, PD;
+ sched.PD.XMM>, TB, PD;
}
}
}
@@ -2665,18 +2665,18 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperat
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
- XS, VEX_4V, VEX_LIG, WIG;
+ TB, XS, VEX, VVVV, VEX_LIG, WIG;
defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
- XD, VEX_4V, VEX_LIG, WIG;
+ TB, XD, VEX, VVVV, VEX_LIG, WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle,
- sched.PS.Scl>, XS;
+ sched.PS.Scl>, TB, XS;
defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
OpNode, FR64, f64mem, SSEPackedDouble,
- sched.PD.Scl>, XD;
+ sched.PD.Scl>, TB, XD;
}
}
}
@@ -2687,18 +2687,18 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, WIG;
+ SSEPackedSingle, sched.PS.Scl, 0>, TB, XS, VEX, VVVV, VEX_LIG, WIG;
defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, WIG;
+ SSEPackedDouble, sched.PD.Scl, 0>, TB, XD, VEX, VVVV, VEX_LIG, WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, sched.PS.Scl>, XS;
+ SSEPackedSingle, sched.PS.Scl>, TB, XS;
defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, sched.PD.Scl>, XD;
+ SSEPackedDouble, sched.PD.Scl>, TB, XD;
}
}
}
@@ -3016,29 +3016,29 @@ let Predicates = [HasAVX, NoVLX] in {
multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
- UseSSE1>, XS;
+ UseSSE1>, TB, XS;
defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
AVXTarget>,
- XS, VEX_4V, VEX_LIG, WIG;
+ TB, XS, VEX, VVVV, VEX_LIG, WIG;
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem,
- ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
+ ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, TB, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
- XS, VEX_4V, VEX_LIG, WIG;
+ TB, XS, VEX, VVVV, VEX_LIG, WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem,
- sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
+ sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, TB, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
- XD, VEX_4V, VEX_LIG, WIG;
+ TB, XD, VEX, VVVV, VEX_LIG, WIG;
}
// Square root.
@@ -3165,11 +3165,11 @@ let SchedRW = [WriteStoreNT] in {
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti{l}\t{$src, $dst|$dst, $src}",
[(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
- PS, Requires<[HasSSE2]>;
+ TB, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movnti{q}\t{$src, $dst|$dst, $src}",
[(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
- PS, Requires<[HasSSE2]>;
+ TB, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]
let Predicates = [HasAVX, NoVLX] in {
@@ -3226,14 +3226,14 @@ let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
- PS, Requires<[HasCLFLUSH]>;
+ TB, Requires<[HasCLFLUSH]>;
}
let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
- "pause", [(int_x86_sse2_pause)]>, OBXS;
+ "pause", [(int_x86_sse2_pause)]>, XS;
}
let SchedRW = [WriteFence] in {
@@ -3241,11 +3241,11 @@ let SchedRW = [WriteFence] in {
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
- PS, Requires<[HasSSE1]>;
+ TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
- PS, Requires<[HasSSE2]>;
+ TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
- PS, Requires<[HasMFence]>;
+ TB, Requires<[HasMFence]>;
} // SchedRW
def : Pat<(X86MFence), (MFENCE)>;
@@ -3266,11 +3266,11 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
- PS, Sched<[WriteLDMXCSR]>;
+ TB, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
- PS, Sched<[WriteSTMXCSR]>;
+ TB, Sched<[WriteSTMXCSR]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -3327,11 +3327,11 @@ def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (loadv2i64 addr:$src))]>,
Sched<[SchedWriteVecMoveLS.XMM.RM]>,
- XS, VEX, WIG;
+ TB, XS, VEX, WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovdqu\t{$src, $dst|$dst, $src}", []>,
Sched<[SchedWriteVecMoveLS.YMM.RM]>,
- XS, VEX, VEX_L, WIG;
+ TB, XS, VEX, VEX_L, WIG;
}
let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
@@ -3347,10 +3347,10 @@ def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",
[(store (v2i64 VR128:$src), addr:$dst)]>,
- Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, WIG;
+ Sched<[SchedWriteVecMoveLS.XMM.MR]>, TB, XS, VEX, WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[]>,
- Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, WIG;
+ Sched<[SchedWriteVecMoveLS.YMM.MR]>, TB, XS, VEX, VEX_L, WIG;
}
let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
@@ -3360,7 +3360,7 @@ def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", []>,
- XS, Requires<[UseSSE2]>;
+ TB, XS, Requires<[UseSSE2]>;
}
// For Disassembler
@@ -3370,7 +3370,7 @@ def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", []>,
- XS, Requires<[UseSSE2]>;
+ TB, XS, Requires<[UseSSE2]>;
}
} // SchedRW
@@ -3382,7 +3382,7 @@ def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
- XS, Requires<[UseSSE2]>;
+ TB, XS, Requires<[UseSSE2]>;
}
let mayStore = 1, hasSideEffects = 0,
@@ -3393,7 +3393,7 @@ def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
- XS, Requires<[UseSSE2]>;
+ TB, XS, Requires<[UseSSE2]>;
}
} // ExeDomain = SSEPackedInt
@@ -3537,12 +3537,12 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
load, i128mem, SchedWriteVecIMul.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
VR256, load, i256mem, SchedWriteVecIMul.YMM,
- 0>, VEX_4V, VEX_L, WIG;
+ 0>, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
memop, i128mem, SchedWriteVecIMul.XMM>;
@@ -3550,11 +3550,11 @@ defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
load, i128mem, SchedWritePSADBW.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
load, i256mem, SchedWritePSADBW.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
memop, i128mem, SchedWritePSADBW.XMM>;
@@ -3604,11 +3604,11 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
- DstVT128, SrcVT, load, 0>, VEX_4V, WIG;
+ DstVT128, SrcVT, load, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
- DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
+ DstVT256, SrcVT, load, 0>, VEX, VVVV, VEX_L,
WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
@@ -3631,11 +3631,11 @@ multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR128, v16i8, sched.XMM, 0>, VEX_4V, WIG;
+ VR128, v16i8, sched.XMM, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
VR256, v32i8, sched.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
sched.XMM>;
@@ -3757,11 +3757,11 @@ let Predicates = [UseSSE2] in {
} // ExeDomain = SSEPackedInt
defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
- SchedWriteShuffle, NoVLX>, PD;
+ SchedWriteShuffle, NoVLX>, TB, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
- SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
- SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
+ SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XD;
//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
@@ -3821,33 +3821,33 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -3892,61 +3892,61 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -4004,7 +4004,7 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
timm:$src2))]>,
- PD, VEX, WIG, Sched<[WriteVecExtract]>;
+ TB, PD, VEX, WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4014,10 +4014,10 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
// Insert
let Predicates = [HasAVX, NoBWI] in
-defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, WIG;
+defm VPINSRW : sse2_pinsrw<0>, TB, PD, VEX, VVVV, WIG;
let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
-defm PINSRW : sse2_pinsrw, PD;
+defm PINSRW : sse2_pinsrw, TB, PD;
} // ExeDomain = SSEPackedInt
@@ -4306,13 +4306,13 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, TB, XS,
VEX, Requires<[UseAVX]>, WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+ TB, XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW
//===---------------------------------------------------------------------===//
@@ -4369,11 +4369,11 @@ let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
- XS, VEX, Requires<[UseAVX]>, WIG;
+ TB, XS, VEX, Requires<[UseAVX]>, WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
- XS, Requires<[UseSSE2]>;
+ TB, XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
let Predicates = [UseAVX] in {
@@ -4563,27 +4563,27 @@ let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
- XD, VEX_4V, WIG;
+ TB, XD, VEX, VVVV, WIG;
defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
- XD, VEX_4V, VEX_L, WIG;
+ TB, XD, VEX, VVVV, VEX_L, WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
- PD, VEX_4V, WIG;
+ TB, PD, VEX, VVVV, WIG;
defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
- PD, VEX_4V, VEX_L, WIG;
+ TB, PD, VEX, VVVV, VEX_L, WIG;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
let ExeDomain = SSEPackedSingle in
defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
- SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
+ SchedWriteFAddSizes.PS.XMM, memopv4f32>, TB, XD;
let ExeDomain = SSEPackedDouble in
defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
- SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
+ SchedWriteFAddSizes.PD.XMM, memopv2f64>, TB, PD;
}
//===---------------------------------------------------------------------===//
@@ -4635,23 +4635,23 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG;
+ X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG;
+ X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG;
+ X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG;
+ X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
- X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG;
+ X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
- X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG;
+ X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG;
+ X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG;
+ X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -4806,45 +4806,45 @@ let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
VR128, load, i128mem,
- SchedWriteVarShuffle.XMM, 0>, VEX_4V, WIG;
+ SchedWriteVarShuffle.XMM, 0>, VEX, VVVV, WIG;
defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, load, i128mem,
- SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, load, i128mem,
- SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
int_x86_ssse3_psign_b_128,
- SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
+ SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
int_x86_ssse3_psign_w_128,
- SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
+ SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
- SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
+ SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
}
}
@@ -4852,42 +4852,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
VR256, load, i256mem,
- SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteVarShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
v32i8, VR256, load, i256mem,
- SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
VR256, load, i256mem,
- SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
VR256, load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
VR256, load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
- SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
- SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
- SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw,
- SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw,
- SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -4956,10 +4956,10 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
- SchedWriteShuffle.XMM, 0>, VEX_4V, WIG;
+ SchedWriteShuffle.XMM, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
- SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
SchedWriteShuffle.XMM>;
@@ -5367,7 +5367,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoBWI] in {
- defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, WIG;
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX, VVVV, WIG;
def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
(VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src2, sub_8bit), timm:$src3)>;
@@ -5398,7 +5398,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoDQI] in
- defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+ defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX, VVVV;
let Constraints = "$src1 = $dst" in
defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
@@ -5424,7 +5424,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoDQI] in
- defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, REX_W;
+ defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX, VVVV, REX_W;
let Constraints = "$src1 = $dst" in
defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
@@ -5459,7 +5459,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Constraints = "$src1 = $dst" in
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
@@ -5638,9 +5638,9 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [UseAVX] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales, 0>,
- VEX_4V, VEX_LIG, WIG, SIMD_EXC;
+ VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
- VEX_4V, VEX_LIG, WIG, SIMD_EXC;
+ VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
}
let Predicates = [UseAVX] in {
@@ -5760,33 +5760,33 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
- Sched<[WritePOPCNT]>, OpSize16, XS;
+ Sched<[WritePOPCNT]>, OpSize16, TB, XS;
def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctpop (loadi16 addr:$src))),
(implicit EFLAGS)]>,
- Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
+ Sched<[WritePOPCNT.Folded]>, OpSize16, TB, XS;
def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
- Sched<[WritePOPCNT]>, OpSize32, XS;
+ Sched<[WritePOPCNT]>, OpSize32, TB, XS;
def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctpop (loadi32 addr:$src))),
(implicit EFLAGS)]>,
- Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
+ Sched<[WritePOPCNT.Folded]>, OpSize32, TB, XS;
def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
- Sched<[WritePOPCNT]>, XS;
+ Sched<[WritePOPCNT]>, TB, XS;
def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctpop (loadi64 addr:$src))),
(implicit EFLAGS)]>,
- Sched<[WritePOPCNT.Folded]>, XS;
+ Sched<[WritePOPCNT.Folded]>, TB, XS;
}
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
@@ -5842,65 +5842,65 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX, NoVLX] in {
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
load, i128mem, SchedWriteVecIMul.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
load, i256mem, SchedWriteVecIMul.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -5927,20 +5927,20 @@ let Constraints = "$src1 = $dst" in {
let Predicates = [HasAVX, NoVLX] in
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
load, i128mem, SchedWritePMULLD.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX] in
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
load, i256mem, SchedWritePMULLD.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Predicates = [HasAVX2] in
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in {
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
@@ -6088,22 +6088,22 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, load, i128mem, 0,
- SchedWriteMPSAD.XMM>, VEX_4V, WIG;
+ SchedWriteMPSAD.XMM>, VEX, VVVV, WIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, load, f128mem, 0,
- SchedWriteDPPS.XMM>, VEX_4V, WIG;
+ SchedWriteDPPS.XMM>, VEX, VVVV, WIG;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
VR128, load, f128mem, 0,
- SchedWriteDPPD.XMM>, VEX_4V, WIG;
+ SchedWriteDPPD.XMM>, VEX, VVVV, WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, load, i256mem, 0,
- SchedWriteDPPS.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteDPPS.YMM>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -6111,7 +6111,7 @@ let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
VR256, load, i256mem, 0,
- SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteMPSAD.YMM>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -6170,30 +6170,30 @@ let Predicates = [HasAVX] in {
defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, load, f128mem, 0, SSEPackedSingle,
SchedWriteFBlend.XMM, BlendCommuteImm4>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, load, f256mem, 0, SSEPackedSingle,
SchedWriteFBlend.YMM, BlendCommuteImm8>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, load, f128mem, 0, SSEPackedDouble,
SchedWriteFBlend.XMM, BlendCommuteImm2>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, load, f256mem, 0, SSEPackedDouble,
SchedWriteFBlend.YMM, BlendCommuteImm4>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, load, i128mem, 0, SSEPackedInt,
SchedWriteBlend.XMM, BlendCommuteImm8>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2] in {
defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, load, i256mem, 0, SSEPackedInt,
SchedWriteBlend.YMM, BlendCommuteImm8>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
@@ -6290,7 +6290,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
- SSEPackedInt>, TAPD, VEX_4V,
+ SSEPackedInt>, TA, PD, VEX, VVVV,
Sched<[sched]>;
def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
@@ -6299,7 +6299,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src3, (mem_frag addr:$src2),
- RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
+ RC:$src1))], SSEPackedInt>, TA, PD, VEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold,
// x86memop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -6564,12 +6564,12 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX] in
defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -6655,49 +6655,51 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//
+// NOTE: 'HasCRC32' is used because CRC32 instructions are GPR-only and are
+// not directly controlled by the SSE42 flag.
+//
// No CRC instructions have AVX equivalents
-// crc intrinsic instruction
-// This set of instructions are only rm, the only difference is the size
-// of r and m.
-class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
- RegisterClass RCIn, SDPatternOperator Int> :
- CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
- !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
- [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
- Sched<[WriteCRC32]>;
-
-class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
- X86MemOperand x86memop, SDPatternOperator Int> :
- CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
- !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
- [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
- Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
-
-let Constraints = "$src1 = $dst" in {
- def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
- int_x86_sse42_crc32_32_8>;
- def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
- int_x86_sse42_crc32_32_8>;
- def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
- int_x86_sse42_crc32_32_16>, OpSize16;
- def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
- int_x86_sse42_crc32_32_16>, OpSize16;
- def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
- int_x86_sse42_crc32_32_32>, OpSize32;
- def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
- int_x86_sse42_crc32_32_32>, OpSize32;
- def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
- int_x86_sse42_crc32_64_64>, REX_W;
- def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
- int_x86_sse42_crc32_64_64>, REX_W;
- let hasSideEffects = 0 in {
- let mayLoad = 1 in
- def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
- null_frag>, REX_W;
- def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
- null_frag>, REX_W;
- }
+class Crc32r<X86TypeInfo t, RegisterClass rc, SDPatternOperator node>
+ : ITy<0xF1, MRMSrcReg, t, (outs rc:$dst), (ins rc:$src1, t.RegClass:$src2),
+ "crc32", binop_args, [(set rc:$dst, (node rc:$src1, t.RegClass:$src2))]>,
+ Sched<[WriteCRC32]>, NoCD8 {
+ let Constraints = "$src1 = $dst";
+}
+
+class Crc32m<X86TypeInfo t, RegisterClass rc, SDPatternOperator node>
+ : ITy<0xF1, MRMSrcMem, t, (outs rc:$dst), (ins rc:$src1, t.MemOperand:$src2),
+ "crc32", binop_args, [(set rc:$dst, (node rc:$src1, (load addr:$src2)))]>,
+ Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>, NoCD8 {
+ let Constraints = "$src1 = $dst";
+}
+
+let Predicates = [HasCRC32, NoEGPR], OpMap = T8, OpPrefix = XD in {
+ def CRC32r32r8 : Crc32r<Xi8, GR32, int_x86_sse42_crc32_32_8>;
+ def CRC32r32m8 : Crc32m<Xi8, GR32, int_x86_sse42_crc32_32_8>;
+ def CRC32r32r16 : Crc32r<Xi16, GR32, int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32m16 : Crc32m<Xi16, GR32, int_x86_sse42_crc32_32_16>, OpSize16;
+ def CRC32r32r32 : Crc32r<Xi32, GR32, int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r32m32 : Crc32m<Xi32, GR32, int_x86_sse42_crc32_32_32>, OpSize32;
+ def CRC32r64r64 : Crc32r<Xi64, GR64, int_x86_sse42_crc32_64_64>;
+ def CRC32r64m64 : Crc32m<Xi64, GR64, int_x86_sse42_crc32_64_64>;
+ def CRC32r64r8 : Crc32r<Xi8, GR64, null_frag>, REX_W;
+ let mayLoad = 1 in
+ def CRC32r64m8 : Crc32m<Xi8, GR64, null_frag>, REX_W;
+}
+
+let Predicates = [HasCRC32, HasEGPR, In64BitMode], OpMap = T_MAP4, OpEnc = EncEVEX in {
+ def CRC32r32r8_EVEX : Crc32r<Xi8, GR32, int_x86_sse42_crc32_32_8>;
+ def CRC32r32m8_EVEX : Crc32m<Xi8, GR32, int_x86_sse42_crc32_32_8>;
+ def CRC32r32r16_EVEX : Crc32r<Xi16, GR32, int_x86_sse42_crc32_32_16>, PD;
+ def CRC32r32m16_EVEX : Crc32m<Xi16, GR32, int_x86_sse42_crc32_32_16>, PD;
+ def CRC32r32r32_EVEX : Crc32r<Xi32, GR32, int_x86_sse42_crc32_32_32>;
+ def CRC32r32m32_EVEX : Crc32m<Xi32, GR32, int_x86_sse42_crc32_32_32>;
+ def CRC32r64r64_EVEX : Crc32r<Xi64, GR64, int_x86_sse42_crc32_64_64>;
+ def CRC32r64m64_EVEX : Crc32m<Xi64, GR64, int_x86_sse42_crc32_64_64>;
+ def CRC32r64r8_EVEX : Crc32r<Xi8, GR64, null_frag>, REX_W;
+ let mayLoad = 1 in
+ def CRC32r64m8_EVEX : Crc32m<Xi8, GR64, null_frag>, REX_W;
}
//===----------------------------------------------------------------------===//
@@ -6715,7 +6717,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
- T8PS, Sched<[sched]>;
+ T8, Sched<[sched]>;
def rm#Suffix : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
@@ -6726,7 +6728,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
(set VR128:$dst, (IntId VR128:$src1,
(memop addr:$src2), XMM0)),
(set VR128:$dst, (IntId VR128:$src1,
- (memop addr:$src2))))]>, T8PS,
+ (memop addr:$src2))))]>, T8,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6736,7 +6738,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, NoEGPR] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 timm:$src3)))]>, TAPS,
+ (i8 timm:$src3)))]>, TA,
Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
@@ -6744,7 +6746,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, NoEGPR] in {
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(memop addr:$src2),
- (i8 timm:$src3)))]>, TAPS,
+ (i8 timm:$src3)))]>, TA,
Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6772,7 +6774,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
(i8 timm:$src3)))]>,
- EVEX_NoCD8, T_MAP4PS, Sched<[SchedWriteVecIMul.XMM]>;
+ EVEX, NoCD8, T_MAP4, Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi_EVEX: Ii8<0xD4, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
@@ -6780,31 +6782,31 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in
(int_x86_sha1rnds4 VR128:$src1,
(memop addr:$src2),
(i8 timm:$src3)))]>,
- EVEX_NoCD8, T_MAP4PS,
+ EVEX, NoCD8, T_MAP4,
Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
defm SHA1NEXTE : SHAI_binop<0xD8, "sha1nexte", int_x86_sha1nexte,
SchedWriteVecIMul.XMM, "_EVEX">,
- EVEX_NoCD8, T_MAP4PS;
+ EVEX, NoCD8, T_MAP4;
defm SHA1MSG1 : SHAI_binop<0xD9, "sha1msg1", int_x86_sha1msg1,
SchedWriteVecIMul.XMM, "_EVEX">,
- EVEX_NoCD8, T_MAP4PS;
+ EVEX, NoCD8, T_MAP4;
defm SHA1MSG2 : SHAI_binop<0xDA, "sha1msg2", int_x86_sha1msg2,
SchedWriteVecIMul.XMM, "_EVEX">,
- EVEX_NoCD8, T_MAP4PS;
+ EVEX, NoCD8, T_MAP4;
let Uses=[XMM0] in
defm SHA256RNDS2 : SHAI_binop<0xDB, "sha256rnds2", int_x86_sha256rnds2,
SchedWriteVecIMul.XMM, "_EVEX", 1>,
- EVEX_NoCD8, T_MAP4PS;
+ EVEX, NoCD8, T_MAP4;
defm SHA256MSG1 : SHAI_binop<0xDC, "sha256msg1", int_x86_sha256msg1,
SchedWriteVecIMul.XMM, "_EVEX">,
- EVEX_NoCD8, T_MAP4PS;
+ EVEX, NoCD8, T_MAP4;
defm SHA256MSG2 : SHAI_binop<0xDD, "sha256msg2", int_x86_sha256msg2,
SchedWriteVecIMul.XMM, "_EVEX">,
- EVEX_NoCD8, T_MAP4PS;
+ EVEX, NoCD8, T_MAP4;
}
//===----------------------------------------------------------------------===//
@@ -6832,28 +6834,28 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
- int_x86_aesni_aesenc, load>, VEX_4V, WIG;
+ int_x86_aesni_aesenc, load>, VEX, VVVV, WIG;
defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
- int_x86_aesni_aesenclast, load>, VEX_4V, WIG;
+ int_x86_aesni_aesenclast, load>, VEX, VVVV, WIG;
defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
- int_x86_aesni_aesdec, load>, VEX_4V, WIG;
+ int_x86_aesni_aesdec, load>, VEX, VVVV, WIG;
defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
- int_x86_aesni_aesdeclast, load>, VEX_4V, WIG;
+ int_x86_aesni_aesdeclast, load>, VEX, VVVV, WIG;
}
let Predicates = [NoVLX, HasVAES] in {
defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
int_x86_aesni_aesenc_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
int_x86_aesni_aesenclast_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
int_x86_aesni_aesdec_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
int_x86_aesni_aesdeclast_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6994,11 +6996,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
- int_x86_pclmulqdq>, VEX_4V, WIG;
+ int_x86_pclmulqdq>, VEX, VVVV, WIG;
let Predicates = [NoVLX, HasVPCLMULQDQ] in
defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
- int_x86_pclmulqdq_256>, VEX_4V, VEX_L, WIG;
+ int_x86_pclmulqdq_256>, VEX, VVVV, VEX_L, WIG;
multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
X86MemOperand MemOp, string Hi, string Lo> {
@@ -7035,26 +7037,26 @@ def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
[(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
timm:$idx))]>,
- PD, Sched<[SchedWriteVecALU.XMM]>;
+ TB, PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"extrq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
VR128:$mask))]>,
- PD, Sched<[SchedWriteVecALU.XMM]>;
+ TB, PD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
"insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
timm:$len, timm:$idx))]>,
- XD, Sched<[SchedWriteVecALU.XMM]>;
+ TB, XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"insertq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
VR128:$mask))]>,
- XD, Sched<[SchedWriteVecALU.XMM]>;
+ TB, XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt
@@ -7062,10 +7064,10 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
let AddedComplexity = 400 in { // Prefer non-temporal versions
let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
- "movntss\t{$src, $dst|$dst, $src}", []>, XS;
+ "movntss\t{$src, $dst|$dst, $src}", []>, TB, XS;
def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
+ "movntsd\t{$src, $dst|$dst, $src}", []>, TB, XD;
} // SchedRW
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
@@ -7160,6 +7162,10 @@ def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128rm addr:$src)>;
}
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+ def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128rm addr:$src)>;
+
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
@@ -7169,11 +7175,11 @@ let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
+ VEX, VVVV, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
+ VEX, VVVV, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
// Immediate transform to help with commuting.
@@ -7212,12 +7218,12 @@ let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f128mem:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}
// To create a 256-bit all ones value, we should produce VCMPTRUEPS
@@ -7315,22 +7321,22 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[schedX.RM]>;
+ VEX, VVVV, Sched<[schedX.RM]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[schedY.RM]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[schedX.MR]>;
+ VEX, VVVV, Sched<[schedX.MR]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[schedY.MR]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}
let ExeDomain = SSEPackedSingle in
@@ -7361,14 +7367,14 @@ multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
VR128:$src2, VR128:$src3)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
(loadv4i32 addr:$src3))))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -7378,14 +7384,14 @@ multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
VR256:$src2, VR256:$src3)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
(loadv8i32 addr:$src3))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
SchedWriteVecIMul.YMM.ReadAfterFold,
SchedWriteVecIMul.YMM.ReadAfterFold]>;
}
@@ -7424,13 +7430,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX, VVVV,
Sched<[varsched]>;
def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop_i:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
- (i_vt (load addr:$src2)))))]>, VEX_4V,
+ (i_vt (load addr:$src2)))))]>, VEX, VVVV,
Sched<[varsched.Folded, sched.ReadAfterFold]>;
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
@@ -7474,12 +7480,12 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
// Zero All YMM registers
def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
+ [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L,
Requires<[HasAVX]>, WIG;
// Zero Upper bits of YMM registers
def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)]>, PS, VEX,
+ [(int_x86_avx_vzeroupper)]>, TB, VEX,
Requires<[HasAVX]>, WIG;
} // Defs
} // SchedRW
@@ -7493,11 +7499,11 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
[(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
- T8PD, VEX, Sched<[sched]>;
+ T8, PD, VEX, Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- []>, T8PD, VEX, Sched<[sched.Folded]>;
+ []>, T8, PD, VEX, Sched<[sched.Folded]>;
}
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
@@ -7506,12 +7512,12 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
- TAPD, VEX, Sched<[RR]>;
+ TA, PD, VEX, Sched<[RR]>;
let hasSideEffects = 0, mayStore = 1 in
def mr : Ii8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- TAPD, VEX, Sched<[MR]>;
+ TA, PD, VEX, Sched<[MR]>;
}
let Predicates = [HasF16C, NoVLX] in {
@@ -7558,14 +7564,14 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
- Sched<[sched]>, VEX_4V;
+ Sched<[sched]>, VEX, VVVV;
def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, VEX, VVVV;
// Pattern to commute if load is in first source.
def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
@@ -7815,7 +7821,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
- Sched<[Sched]>, VEX_4V, VEX_L;
+ Sched<[Sched]>, VEX, VVVV, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, memOp:$src2),
!strconcat(OpcodeStr,
@@ -7823,7 +7829,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr,
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1,
(load addr:$src2))))]>,
- Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
+ Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VVVV, VEX_L;
}
}
@@ -7866,11 +7872,11 @@ let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+ Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+ Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
let Predicates = [HasAVX2] in {
defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
@@ -7888,12 +7894,12 @@ let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, u8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i128mem:$src2, u8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
@@ -7905,6 +7911,9 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>;
+
//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
@@ -7927,6 +7936,9 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+ defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
+
//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
@@ -7939,22 +7951,22 @@ multiclass avx2_pmovmask<string OpcodeStr,
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[schedX.RM]>;
+ VEX, VVVV, Sched<[schedX.RM]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[schedY.RM]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[schedX.MR]>;
+ VEX, VVVV, Sched<[schedX.MR]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[schedY.MR]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -8012,28 +8024,28 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
- VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM]>;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1,
(vt128 (load addr:$src2)))))]>,
- VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
+ VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM.Folded,
SchedWriteVarVecShift.XMM.ReadAfterFold]>;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
(vt256 (load addr:$src2)))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}
@@ -8109,12 +8121,12 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
let isCommutable = 1 in
def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
[(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
- Sched<[sched]>, T8PD;
+ Sched<[sched]>, T8;
def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
[(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
(MemOpFrag addr:$src2))))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, T8;
}
}
@@ -8146,10 +8158,10 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
load, i128mem, SchedWriteVecIMul.XMM>,
- VEX_4V, REX_W;
+ VEX, VVVV, REX_W;
defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
load, i256mem, SchedWriteVecIMul.YMM>,
- VEX_4V, VEX_L, REX_W;
+ VEX, VVVV, VEX_L, REX_W;
}
}
@@ -8160,16 +8172,16 @@ defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
- i128mem, SchedWriteVecALU.XMM>, VEX_4V;
+ i128mem, SchedWriteVecALU.XMM>, VEX, VVVV;
defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
- i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L;
+ i256mem, SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
- X86GF2P8affineinvqb>, TAPD;
+ X86GF2P8affineinvqb>, TA, PD;
defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
- X86GF2P8affineqb>, TAPD;
+ X86GF2P8affineqb>, TA, PD;
}
// AVX-IFMA
@@ -8183,28 +8195,28 @@ multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
VR128:$src3, VR128:$src1)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
}
def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
(loadv2i64 addr:$src3), VR128:$src1)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
let isCommutable = 1 in {
def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
VR256:$src3, VR256:$src1)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}
def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
(loadv4i64 addr:$src3), VR256:$src1)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}
defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, REX_W, ExplicitVEXPrefix;
@@ -8222,52 +8234,52 @@ multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
- VEX_4V, Sched<[Sched]>;
+ VEX, VVVV, Sched<[Sched]>;
def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, X86memop:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
(MemOpFrag addr:$src3))))]>,
- VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
+ VEX, VVVV, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}
let Predicates = [HasAVXVNNIINT8] in {
defm VPDPBSSD : avx_dotprod_rm<0x50,"vpdpbssd", v4i32, VR128, loadv4i32,
i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
- 1>, T8XD;
+ 1>, T8, XD;
defm VPDPBSSDY : avx_dotprod_rm<0x50,"vpdpbssd", v8i32, VR256, loadv8i32,
i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
- 1>, VEX_L, T8XD;
+ 1>, VEX_L, T8, XD;
defm VPDPBUUD : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32,
i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
- 1>, T8PS;
+ 1>, T8;
defm VPDPBUUDY : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32,
i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
- 1>, VEX_L, T8PS;
+ 1>, VEX_L, T8;
defm VPDPBSSDS : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32,
i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
- 1>, T8XD;
+ 1>, T8, XD;
defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32,
i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
- 1>, VEX_L, T8XD;
+ 1>, VEX_L, T8, XD;
defm VPDPBUUDS : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32,
i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
- 1>, T8PS;
+ 1>, T8;
defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32,
i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
- 1>, VEX_L, T8PS;
+ 1>, VEX_L, T8;
defm VPDPBSUD : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32,
i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
- 0>, T8XS;
+ 0>, T8, XS;
defm VPDPBSUDY : avx_dotprod_rm<0x50,"vpdpbsud", v8i32, VR256, loadv8i32,
i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
- 0>, VEX_L, T8XS;
+ 0>, VEX_L, T8, XS;
defm VPDPBSUDS : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32,
i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
- 0>, T8XS;
+ 0>, T8, XS;
defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32,
i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
- 0>, VEX_L, T8XS;
+ 0>, VEX_L, T8, XS;
}
// AVX-NE-CONVERT
@@ -8306,18 +8318,18 @@ multiclass VCVTNEPS2BF16_BASE {
let Predicates = [HasAVXNECONVERT] in {
defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem,
- f16mem>, T8XS;
+ f16mem>, T8, XS;
defm VBCSTNESH2PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem, f16mem>,
- T8PD;
+ T8, PD;
defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem,
- f256mem>, T8XS;
+ f256mem>, T8, XS;
defm VCVTNEEPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem,
- f256mem>, T8PD;
+ f256mem>, T8, PD;
defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem,
- f256mem>, T8XD;
+ f256mem>, T8, XD;
defm VCVTNEOPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem,
- f256mem>, T8PS;
- defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
+ f256mem>, T8;
+ defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8, XS, ExplicitVEXPrefix;
def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))),
(VCVTNEPS2BF16Yrr VR256:$src)>;
@@ -8337,19 +8349,19 @@ def VSHA512MSG1rr : I<0xcc, MRMSrcReg, (outs VR256:$dst),
"vsha512msg1\t{$src2, $dst|$dst, $src2}",
[(set VR256:$dst,
(int_x86_vsha512msg1 VR256:$src1, VR128:$src2))]>, VEX_L,
- VEX, T8XD, Sched<[WriteVecIMul]>;
+ VEX, T8, XD, Sched<[WriteVecIMul]>;
def VSHA512MSG2rr : I<0xcd, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
"vsha512msg2\t{$src2, $dst|$dst, $src2}",
[(set VR256:$dst,
(int_x86_vsha512msg2 VR256:$src1, VR256:$src2))]>, VEX_L,
- VEX, T8XD, Sched<[WriteVecIMul]>;
+ VEX, T8, XD, Sched<[WriteVecIMul]>;
def VSHA512RNDS2rr : I<0xcb, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR128:$src3),
"vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR256:$dst,
(int_x86_vsha512rnds2 VR256:$src1, VR256:$src2, VR128:$src3))]>,
- VEX_L, VEX_4V, T8XD, Sched<[WriteVecIMul]>;
+ VEX_L, VEX, VVVV, T8, XD, Sched<[WriteVecIMul]>;
}
// FIXME: Is there a better scheduler class for SM3 than WriteVecIMul?
@@ -8361,14 +8373,14 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
[(set VR128:$dst,
(!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
VR128:$src2, VR128:$src3))]>,
- Sched<[WriteVecIMul]>, VEX_4V;
+ Sched<[WriteVecIMul]>, VEX, VVVV;
def rm : I<0xda, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst,
(!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
VR128:$src2, (loadv4i32 addr:$src3)))]>,
- Sched<[WriteVecIMul]>, VEX_4V;
+ Sched<[WriteVecIMul]>, VEX, VVVV;
}
multiclass VSM3RNDS2_Base {
@@ -8389,9 +8401,9 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
}
}
-defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8PS;
-defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8PD;
-defm VSM3RNDS2 : VSM3RNDS2_Base, VEX_4V, TAPD;
+defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8;
+defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8, PD;
+defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TA, PD;
// FIXME: Is there a better scheduler class for SM4 than WriteVecIMul?
let Predicates = [HasSM4] in {
@@ -8412,10 +8424,10 @@ let Predicates = [HasSM4] in {
}
}
-defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX_4V;
-defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX_4V;
-defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX_4V;
-defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX_4V;
+defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8, XS, VEX, VVVV;
+defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8, XS, VEX_L, VEX, VVVV;
+defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8, XD, VEX, VVVV;
+defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8, XD, VEX_L, VEX, VVVV;
let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
@@ -8426,7 +8438,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR128:$dst,
(v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
VR128:$src1, VR128:$src2, VR128:$src3)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
def rm : I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
@@ -8434,7 +8446,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR128:$dst,
(v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
let isCommutable = IsCommutable in
def Yrr : I<opc, MRMSrcReg, (outs VR256:$dst),
@@ -8443,7 +8455,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR256:$dst,
(v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
def Yrm : I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
@@ -8451,12 +8463,12 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR256:$dst,
(v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}
-defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS;
-defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8XS;
-defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8PD;
-defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8PD;
-defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8PS;
-defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8PS;
+defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS;
+defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS;
+defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD;
+defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD;
+defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8;
+defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8;
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index 48bf23f8cbf7..d13e3b7af69a 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -829,12 +829,12 @@ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop,
let hasSideEffects = 0 in {
def ri#Suffix : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- TAXD, VEX, Sched<[WriteShift]>;
+ TA, XD, VEX, Sched<[WriteShift]>;
let mayLoad = 1 in
def mi#Suffix : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, u8imm:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- TAXD, VEX, Sched<[WriteShiftLd]>;
+ TA, XD, VEX, Sched<[WriteShiftLd]>;
}
}
@@ -860,23 +860,23 @@ let hasSideEffects = 0 in {
let Predicates = [HasBMI2, NoEGPR] in {
defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, REX_W;
- defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
- defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, REX_W;
- defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
- defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, REX_W;
- defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
- defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, REX_W;
+ defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8, XS;
+ defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8, XS, REX_W;
+ defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8, XD;
+ defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8, XD, REX_W;
+ defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, PD;
+ defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, PD, REX_W;
}
let Predicates = [HasBMI2, HasEGPR] in {
defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem, "_EVEX">, EVEX;
defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem, "_EVEX">, REX_W, EVEX;
- defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8XS, EVEX;
- defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem, "_EVEX">, T8XS, REX_W, EVEX;
- defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem, "_EVEX">, T8XD, EVEX;
- defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem, "_EVEX">, T8XD, REX_W, EVEX;
- defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem, "_EVEX">, T8PD, EVEX;
- defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8PD, REX_W, EVEX;
+ defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8, XS, EVEX;
+ defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem, "_EVEX">, T8, XS, REX_W, EVEX;
+ defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem, "_EVEX">, T8, XD, EVEX;
+ defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem, "_EVEX">, T8, XD, REX_W, EVEX;
+ defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem, "_EVEX">, T8, PD, EVEX;
+ defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8, PD, REX_W, EVEX;
}
let Predicates = [HasBMI2] in {
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index cbb5d4ed5bbd..699e5847e63f 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -426,31 +426,31 @@ let SchedRW = [WriteSystem] in {
let Uses = [EAX, ECX, EDX] in
def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
let Uses = [EAX, ECX, EDX] in
-def WRMSRNS : I<0x01, MRM_C6, (outs), (ins), "wrmsrns", []>, PS;
+def WRMSRNS : I<0x01, MRM_C6, (outs), (ins), "wrmsrns", []>, TB;
let Defs = [EAX, EDX], Uses = [ECX] in
def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
let Defs = [RAX, EFLAGS], Uses = [RBX, RCX], Predicates = [In64BitMode] in
-def PBNDKB : I<0x01, MRM_C7, (outs), (ins), "pbndkb", []>, PS;
+def PBNDKB : I<0x01, MRM_C7, (outs), (ins), "pbndkb", []>, TB;
let Uses = [RSI, RDI, RCX], Predicates = [In64BitMode] in {
-def WRMSRLIST : I<0x01, MRM_C6, (outs), (ins), "wrmsrlist", []>, XS;
-def RDMSRLIST : I<0x01, MRM_C6, (outs), (ins), "rdmsrlist", []>, XD;
+def WRMSRLIST : I<0x01, MRM_C6, (outs), (ins), "wrmsrlist", []>, TB, XS;
+def RDMSRLIST : I<0x01, MRM_C6, (outs), (ins), "rdmsrlist", []>, TB, XD;
}
let Predicates = [HasUSERMSR], mayLoad = 1 in {
def URDMSRrr : I<0xf8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"urdmsr\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (int_x86_urdmsr GR64:$src))]>, T8XD;
+ [(set GR64:$dst, (int_x86_urdmsr GR64:$src))]>, T8, XD;
def URDMSRri : Ii32<0xf8, MRM0r, (outs GR64:$dst), (ins i64i32imm:$imm),
"urdmsr\t{$imm, $dst|$dst, $imm}",
- [(set GR64:$dst, (int_x86_urdmsr i64immSExt32_su:$imm))]>, T_MAP7XD, VEX;
+ [(set GR64:$dst, (int_x86_urdmsr i64immSExt32_su:$imm))]>, T_MAP7, XD, VEX;
}
let Predicates = [HasUSERMSR], mayStore = 1 in {
def UWRMSRrr : I<0xf8, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
- "uwrmsr\t{$src1, $src2|$src2, $src1}",
- [(int_x86_uwrmsr GR64:$src1, GR64:$src2)]>, T8XS;
+ "uwrmsr\t{$src2, $src1|$src1, $src2}",
+ [(int_x86_uwrmsr GR64:$src1, GR64:$src2)]>, T8, XS;
def UWRMSRir : Ii32<0xf8, MRM0r, (outs), (ins GR64:$src, i64i32imm:$imm),
"uwrmsr\t{$src, $imm|$imm, $src}",
- [(int_x86_uwrmsr GR64:$src, i64immSExt32_su:$imm)]>, T_MAP7XS, VEX;
+ [(int_x86_uwrmsr i64immSExt32_su:$imm, GR64:$src)]>, T_MAP7, XS, VEX;
}
let Defs = [RAX, RDX], Uses = [ECX] in
def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
@@ -481,12 +481,12 @@ let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
// Cache instructions
let SchedRW = [WriteSystem] in {
def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
-def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, TB, PS;
// wbnoinvd is like wbinvd, except without invalidation
// encoding: like wbinvd + an 0xF3 prefix
def WBNOINVD : I<0x09, RawFrm, (outs), (ins), "wbnoinvd",
- [(int_x86_wbnoinvd)]>, XS,
+ [(int_x86_wbnoinvd)]>, TB, XS,
Requires<[HasWBNOINVD]>;
} // SchedRW
@@ -497,57 +497,74 @@ let SchedRW = [WriteSystem] in {
let Uses = [SSP] in {
let Defs = [SSP] in {
def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src",
- [(int_x86_incsspd GR32:$src)]>, XS;
+ [(int_x86_incsspd GR32:$src)]>, TB, XS;
def INCSSPQ : RI<0xAE, MRM5r, (outs), (ins GR64:$src), "incsspq\t$src",
- [(int_x86_incsspq GR64:$src)]>, XS;
+ [(int_x86_incsspq GR64:$src)]>, TB, XS;
} // Defs SSP
let Constraints = "$src = $dst" in {
def RDSSPD : I<0x1E, MRM1r, (outs GR32:$dst), (ins GR32:$src),
"rdsspd\t$dst",
- [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, XS;
+ [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, TB, XS;
def RDSSPQ : RI<0x1E, MRM1r, (outs GR64:$dst), (ins GR64:$src),
"rdsspq\t$dst",
- [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, XS;
+ [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, TB, XS;
}
let Defs = [SSP] in {
def SAVEPREVSSP : I<0x01, MRM_EA, (outs), (ins), "saveprevssp",
- [(int_x86_saveprevssp)]>, XS;
+ [(int_x86_saveprevssp)]>, TB, XS;
def RSTORSSP : I<0x01, MRM5m, (outs), (ins i32mem:$src),
"rstorssp\t$src",
- [(int_x86_rstorssp addr:$src)]>, XS;
+ [(int_x86_rstorssp addr:$src)]>, TB, XS;
} // Defs SSP
} // Uses SSP
+let Predicates = [NoEGPR] in {
def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"wrssd\t{$src, $dst|$dst, $src}",
- [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS;
+ [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8;
def WRSSQ : RI<0xF6, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"wrssq\t{$src, $dst|$dst, $src}",
- [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8PS;
+ [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8;
def WRUSSD : I<0xF5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"wrussd\t{$src, $dst|$dst, $src}",
- [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8PD;
+ [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8, PD;
def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"wrussq\t{$src, $dst|$dst, $src}",
- [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD;
+ [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8, PD;
+}
+
+let Predicates = [HasEGPR, In64BitMode] in {
+ def WRSSD_EVEX : I<0x66, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrssd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4;
+ def WRSSQ_EVEX : RI<0x66, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrssq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4;
+ def WRUSSD_EVEX : I<0x65, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrussd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PD;
+ def WRUSSQ_EVEX : RI<0x65, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrussq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PD;
+}
let Defs = [SSP] in {
let Uses = [SSP] in {
def SETSSBSY : I<0x01, MRM_E8, (outs), (ins), "setssbsy",
- [(int_x86_setssbsy)]>, XS;
+ [(int_x86_setssbsy)]>, TB, XS;
} // Uses SSP
def CLRSSBSY : I<0xAE, MRM6m, (outs), (ins i32mem:$src),
"clrssbsy\t$src",
- [(int_x86_clrssbsy addr:$src)]>, XS;
+ [(int_x86_clrssbsy addr:$src)]>, TB, XS;
} // Defs SSP
} // SchedRW
let SchedRW = [WriteSystem] in {
- def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, XS;
- def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, XS;
+ def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, TB, XS;
+ def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, TB, XS;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -557,51 +574,51 @@ let SchedRW = [WriteSystem] in {
// on Windows without needing to enable the xsave feature to be compatible with
// MSVC.
let Defs = [EDX, EAX], Uses = [ECX] in
-def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS;
+def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
let Uses = [EDX, EAX, ECX] in
def XSETBV : I<0x01, MRM_D1, (outs), (ins),
"xsetbv",
- [(int_x86_xsetbv ECX, EDX, EAX)]>, PS;
+ [(int_x86_xsetbv ECX, EDX, EAX)]>, TB;
let Uses = [EDX, EAX] in {
def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
"xsave\t$dst",
- [(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
+ [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE]>;
def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaquemem:$dst),
"xsave64\t$dst",
- [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
+ [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>;
def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
"xrstor\t$dst",
- [(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
+ [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE]>;
def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaquemem:$dst),
"xrstor64\t$dst",
- [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
+ [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>;
def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
"xsaveopt\t$dst",
- [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>;
+ [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEOPT]>;
def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
"xsaveopt64\t$dst",
- [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>;
+ [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEOPT, In64BitMode]>;
def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec\t$dst",
- [(int_x86_xsavec addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC]>;
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>;
def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec64\t$dst",
- [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC, In64BitMode]>;
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>;
def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves\t$dst",
- [(int_x86_xsaves addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves64\t$dst",
- [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
+ [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>;
def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors\t$dst",
- [(int_x86_xrstors addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors64\t$dst",
- [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES, In64BitMode]>;
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>;
} // Uses
} // SchedRW
@@ -634,10 +651,10 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru",
- [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, PS;
+ [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB;
let Uses = [EAX, ECX, EDX] in
def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru",
- [(X86wrpkru EAX, EDX, ECX)]>, PS;
+ [(X86wrpkru EAX, EDX, ECX)]>, TB;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -645,28 +662,28 @@ let Uses = [EAX, ECX, EDX] in
let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in {
def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
"rdfsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
+ [(set GR32:$dst, (int_x86_rdfsbase_32))]>, TB, XS;
def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
"rdfsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
+ [(set GR64:$dst, (int_x86_rdfsbase_64))]>, TB, XS;
def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
"rdgsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+ [(set GR32:$dst, (int_x86_rdgsbase_32))]>, TB, XS;
def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
"rdgsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+ [(set GR64:$dst, (int_x86_rdgsbase_64))]>, TB, XS;
def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
"wrfsbase{l}\t$src",
- [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+ [(int_x86_wrfsbase_32 GR32:$src)]>, TB, XS;
def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
"wrfsbase{q}\t$src",
- [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+ [(int_x86_wrfsbase_64 GR64:$src)]>, TB, XS;
def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
"wrgsbase{l}\t$src",
- [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+ [(int_x86_wrgsbase_32 GR32:$src)]>, TB, XS;
def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
"wrgsbase{q}\t$src",
- [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+ [(int_x86_wrgsbase_64 GR64:$src)]>, TB, XS;
}
//===----------------------------------------------------------------------===//
@@ -674,15 +691,15 @@ let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in {
let SchedRW = [WriteSystem] in {
def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
"invpcid\t{$src2, $src1|$src1, $src2}",
- [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8PD,
+ [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8, PD,
Requires<[Not64BitMode, HasINVPCID]>;
def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
Requires<[In64BitMode, HasINVPCID]>;
def INVPCID64_EVEX : I<0xF2, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
"invpcid\t{$src2, $src1|$src1, $src2}", []>,
- EVEX_NoCD8, T_MAP4XS, Requires<[In64BitMode, HasINVPCID]>;
+ EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasINVPCID]>;
} // SchedRW
let Predicates = [In64BitMode, HasINVPCID] in {
@@ -701,15 +718,15 @@ let Predicates = [In64BitMode, HasINVPCID] in {
//===----------------------------------------------------------------------===//
// SMAP Instruction
let Defs = [EFLAGS], SchedRW = [WriteSystem] in {
- def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, PS;
- def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, PS;
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
}
//===----------------------------------------------------------------------===//
// SMX Instruction
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
- def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, PS;
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
} // Uses, Defs
} // SchedRW
@@ -730,9 +747,9 @@ def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
// RDPID Instruction
let SchedRW = [WriteSystem] in {
def RDPID32 : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
- "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, XS,
+ "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, TB, XS,
Requires<[Not64BitMode, HasRDPID]>;
-def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, XS,
+def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, TB, XS,
Requires<[In64BitMode, HasRDPID]>;
} // SchedRW
@@ -748,17 +765,17 @@ let Predicates = [In64BitMode, HasRDPID] in {
// PTWRITE Instruction - Write Data to a Processor Trace Packet
let SchedRW = [WriteSystem] in {
def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst),
- "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, XS,
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, TB, XS,
Requires<[HasPTWRITE]>;
def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst),
- "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, XS,
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, TB, XS,
Requires<[In64BitMode, HasPTWRITE]>;
def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst),
- "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, XS,
+ "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, TB, XS,
Requires<[HasPTWRITE]>;
def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
- "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, XS,
+ "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, TB, XS,
Requires<[In64BitMode, HasPTWRITE]>;
} // SchedRW
@@ -767,7 +784,7 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
let SchedRW = [WriteSystem] in {
let Uses = [ECX], Defs = [EAX, EDX] in
- def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, PS,
+ def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, TB,
Requires<[HasRDPRU]>;
}
@@ -786,6 +803,6 @@ let Uses = [ECX], Defs = [EAX, EDX] in
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in
- def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, PS,
+ def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB,
Requires<[HasPCONFIG]>;
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrTBM.td b/llvm/lib/Target/X86/X86InstrTBM.td
index ed514038a12e..09200f0c1a9f 100644
--- a/llvm/lib/Target/X86/X86InstrTBM.td
+++ b/llvm/lib/Target/X86/X86InstrTBM.td
@@ -46,11 +46,11 @@ multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
let hasSideEffects = 0 in {
def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
- XOP_4V, XOP9, Sched<[Sched]>;
+ XOP, VVVV, XOP9, Sched<[Sched]>;
let mayLoad = 1 in
def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
- XOP_4V, XOP9, Sched<[Sched.Folded]>;
+ XOP, VVVV, XOP9, Sched<[Sched.Folded]>;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrTDX.td b/llvm/lib/Target/X86/X86InstrTDX.td
index 8d7cd6082095..fe01677b2ea1 100644
--- a/llvm/lib/Target/X86/X86InstrTDX.td
+++ b/llvm/lib/Target/X86/X86InstrTDX.td
@@ -17,23 +17,17 @@
// 64-bit only instructions
let SchedRW = [WriteSystem], Predicates = [In64BitMode] in {
// SEAMCALL - Call to SEAM VMX-root Operation Module
-def SEAMCALL : I<0x01, MRM_CF, (outs), (ins),
- "seamcall", []>, PD;
+def SEAMCALL : I<0x01, MRM_CF, (outs), (ins), "seamcall", []>, TB, PD;
// SEAMRET - Return to Legacy VMX-root Operation
-def SEAMRET : I<0x01, MRM_CD, (outs), (ins),
- "seamret", []>, PD;
+def SEAMRET : I<0x01, MRM_CD, (outs), (ins), "seamret", []>, TB, PD;
// SEAMOPS - SEAM Operations
-def SEAMOPS : I<0x01, MRM_CE, (outs), (ins),
- "seamops", []>, PD;
-
+def SEAMOPS : I<0x01, MRM_CE, (outs), (ins), "seamops", []>, TB, PD;
} // SchedRW
// common instructions
let SchedRW = [WriteSystem] in {
// TDCALL - Call SEAM Module Functions
-def TDCALL : I<0x01, MRM_CC, (outs), (ins),
- "tdcall", []>, PD;
-
+def TDCALL : I<0x01, MRM_CC, (outs), (ins), "tdcall", []>, TB, PD;
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td
index 7671eb4676ee..57604b682d54 100644
--- a/llvm/lib/Target/X86/X86InstrTSX.td
+++ b/llvm/lib/Target/X86/X86InstrTSX.td
@@ -37,11 +37,11 @@ def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
}
def XEND : I<0x01, MRM_D5, (outs), (ins),
- "xend", [(int_x86_xend)]>, PS, Requires<[HasRTM]>;
+ "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
let Defs = [EFLAGS] in
def XTEST : I<0x01, MRM_D6, (outs), (ins),
- "xtest", [(set EFLAGS, (X86xtest))]>, PS, Requires<[HasRTM]>;
+ "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>;
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td
index 78a481290359..da85922a018d 100644
--- a/llvm/lib/Target/X86/X86InstrUtils.td
+++ b/llvm/lib/Target/X86/X86InstrUtils.td
@@ -27,53 +27,31 @@ class REP { bit hasREPPrefix = 1; }
class TB { Map OpMap = TB; }
class T8 { Map OpMap = T8; }
class TA { Map OpMap = TA; }
-class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; }
-class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; }
-class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class T_MAP4 { Map OpMap = T_MAP4; }
+class T_MAP5 { Map OpMap = T_MAP5; }
+class T_MAP6 { Map OpMap = T_MAP6; }
+class T_MAP7 { Map OpMap = T_MAP7; }
+class XOP8 { Map OpMap = XOP8; }
+class XOP9 { Map OpMap = XOP9; }
+class XOPA { Map OpMap = XOPA; }
class ThreeDNow { Map OpMap = ThreeDNow; }
-class T_MAP4 { Map OpMap = T_MAP4; }
-class T_MAP4PS : T_MAP4 { Prefix OpPrefix = PS; } // none
-class T_MAP4PD : T_MAP4 { Prefix OpPrefix = PD; } // 0x66
-class T_MAP4XS : T_MAP4 { Prefix OpPrefix = XS; } // 0xF3
-class T_MAP4XD : T_MAP4 { Prefix OpPrefix = XD; } // 0xF2
-class T_MAP5 { Map OpMap = T_MAP5; }
-class T_MAP5PS : T_MAP5 { Prefix OpPrefix = PS; } // none
-class T_MAP5PD : T_MAP5 { Prefix OpPrefix = PD; } // 0x66
-class T_MAP5XS : T_MAP5 { Prefix OpPrefix = XS; } // 0xF3
-class T_MAP5XD : T_MAP5 { Prefix OpPrefix = XD; } // 0xF2
-class T_MAP6 { Map OpMap = T_MAP6; }
-class T_MAP6PS : T_MAP6 { Prefix OpPrefix = PS; }
-class T_MAP6PD : T_MAP6 { Prefix OpPrefix = PD; }
-class T_MAP6XS : T_MAP6 { Prefix OpPrefix = XS; }
-class T_MAP6XD : T_MAP6 { Prefix OpPrefix = XD; }
-class T_MAP7 { Map OpMap = T_MAP7; }
-class T_MAP7XS : T_MAP7 { Prefix OpPrefix = XS; } // 0xF3
-class T_MAP7XD : T_MAP7 { Prefix OpPrefix = XD; } // 0xF2
-class OBXS { Prefix OpPrefix = XS; }
-class PS : TB { Prefix OpPrefix = PS; }
-class PD : TB { Prefix OpPrefix = PD; }
-class XD : TB { Prefix OpPrefix = XD; }
-class XS : TB { Prefix OpPrefix = XS; }
-class T8PS : T8 { Prefix OpPrefix = PS; }
-class T8PD : T8 { Prefix OpPrefix = PD; }
-class T8XD : T8 { Prefix OpPrefix = XD; }
-class T8XS : T8 { Prefix OpPrefix = XS; }
-class TAPS : TA { Prefix OpPrefix = PS; }
-class TAPD : TA { Prefix OpPrefix = PD; }
-class TAXD : TA { Prefix OpPrefix = XD; }
-class TAXS : TA { Prefix OpPrefix = XS; }
-class VEX { Encoding OpEnc = EncVEX; }
+class PS { Prefix OpPrefix = PS; }
+class PD { Prefix OpPrefix = PD; }
+class XD { Prefix OpPrefix = XD; }
+class XS { Prefix OpPrefix = XS; }
+class XOP { Encoding OpEnc = EncXOP; }
+class VEX { Encoding OpEnc = EncVEX; }
+class EVEX { Encoding OpEnc = EncEVEX; }
class WIG { bit IgnoresW = 1; }
// Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX.
class VEX_W1X { bit hasREX_W = 1; bit EVEX_W1_VEX_W0 = 1; }
-class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
-class EVEX { Encoding OpEnc = EncEVEX; }
-class EVEX_4V : EVEX { bit hasVEX_4V = 1; }
+class VVVV { bit hasVEX_4V = 1; }
class EVEX_K { bit hasEVEX_K = 1; }
class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_NF { bit hasEVEX_NF = 1; }
class EVEX_RC { bit hasEVEX_RC = 1; }
class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
@@ -86,29 +64,28 @@ class EVEX_CD8<int esize, CD8VForm form> {
int CD8_EltSize = !srl(esize, 3);
bits<3> CD8_Form = form.Value;
}
-class EVEX_NoCD8 : EVEX { bits<7> CD8_Scale = 0; }
-class XOP { Encoding OpEnc = EncXOP; }
-class XOP_4V : XOP { bit hasVEX_4V = 1; }
+class NoCD8 { bits<7> CD8_Scale = 0; }
+
class EVEX2VEXOverride<string VEXInstrName> {
string EVEX2VEXOverride = VEXInstrName;
}
-class AVX512BIi8Base : PD {
+class AVX512BIi8Base : TB, PD {
Domain ExeDomain = SSEPackedInt;
ImmType ImmT = Imm8;
}
-class AVX512XSIi8Base : XS {
+class AVX512XSIi8Base : TB, XS {
Domain ExeDomain = SSEPackedInt;
ImmType ImmT = Imm8;
}
-class AVX512XDIi8Base : XD {
+class AVX512XDIi8Base : TB, XD {
Domain ExeDomain = SSEPackedInt;
ImmType ImmT = Imm8;
}
-class AVX512PSIi8Base : PS {
+class AVX512PSIi8Base : TB {
Domain ExeDomain = SSEPackedSingle;
ImmType ImmT = Imm8;
}
-class AVX512PDIi8Base : PD {
+class AVX512PDIi8Base : TB, PD {
Domain ExeDomain = SSEPackedDouble;
ImmType ImmT = Imm8;
}
@@ -116,16 +93,32 @@ class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
class ExplicitREX2Prefix { ExplicitOpPrefix explicitOpPrefix = ExplicitREX2; }
class ExplicitVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitVEX; }
class ExplicitEVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitEVEX; }
+class DefEFLAGS { list<Register> Defs = [EFLAGS]; }
+class UseEFLAGS { list<Register> Uses = [EFLAGS]; }
+class DisassembleOnly {
+ // The disassembler should know about this, but not the asmparser.
+ bit isCodeGenOnly = 1;
+ bit ForceDisassemble = 1;
+}
-// SchedModel info for instruction that loads one value and gets the second
-// (and possibly third) value from a register.
-// This is used for instructions that put the memory operands before other
-// uses.
-class SchedLoadReg<X86FoldableSchedWrite Sched> : Sched<[Sched.Folded,
- // Memory operand.
- ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
- // Register reads (implicit or explicit).
- Sched.ReadAfterFold, Sched.ReadAfterFold]>;
+defvar unaryop_args = "$src1";
+defvar unaryop_ndd_args = "{$src1, $dst|$dst, $src1}";
+defvar binop_args = "{$src2, $src1|$src1, $src2}";
+defvar binop_ndd_args = "{$src2, $src1, $dst|$dst, $src1, $src2}";
+defvar tie_dst_src1 = "$src1 = $dst";
+
+// NDD - Helper for new data destination instructions
+class NDD<bit ndd> {
+ string Constraints = !if(!eq(ndd, 0), tie_dst_src1, "");
+ Encoding OpEnc = !if(!eq(ndd, 0), EncNormal, EncEVEX);
+ bit hasEVEX_B = ndd;
+ bit hasVEX_4V = ndd;
+ Map OpMap = !if(!eq(ndd, 0), OB, T_MAP4);
+}
+// NF - Helper for NF (no flags update) instructions
+class NF: T_MAP4, EVEX, EVEX_NF, NoCD8;
+// PL - Helper for promoted legacy instructions
+class PL: T_MAP4, EVEX, NoCD8, ExplicitEVEXPrefix;
//===----------------------------------------------------------------------===//
// X86 Type information definitions
@@ -139,8 +132,7 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
Operand immoperand, SDPatternOperator immoperator,
SDPatternOperator immnosuoperator, Operand imm8operand,
SDPatternOperator imm8operator, SDPatternOperator imm8nosuoperator,
- bit hasOddOpcode, OperandSize opSize,
- bit hasREX_W> {
+ bit hasEvenOpcode, bit hasREX_W> {
/// VT - This is the value type itself.
ValueType VT = vt;
@@ -189,15 +181,10 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
SDPatternOperator Imm8NoSuOperator = imm8nosuoperator;
- /// HasOddOpcode - This bit is true if the instruction should have an odd (as
- /// opposed to even) opcode. Operations on i8 are usually even, operations on
- /// other datatypes are odd.
- bit HasOddOpcode = hasOddOpcode;
-
- /// OpSize - Selects whether the instruction needs a 0x66 prefix based on
- /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this
- /// to Opsize16. i32 sets this to OpSize32.
- OperandSize OpSize = opSize;
+ /// HasEvenOpcode - This bit is true if the instruction should have an even (as
+ /// opposed to odd) opcode. Operations on i8 are even, operations on
+ /// other datatypes are usually odd.
+ bit HasEvenOpcode = hasEvenOpcode;
/// HasREX_W - This bit is set to true if the instruction should have
/// the 0x40 REX prefix. This is set for i64 types.
@@ -208,16 +195,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, Imm8, i8imm,
imm_su, imm, i8imm, invalid_node, invalid_node,
- 0, OpSizeFixed, 0>;
+ 1, 0>;
def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, Imm16, i16imm,
imm_su, imm, i16i8imm, i16immSExt8_su, i16immSExt8,
- 1, OpSize16, 0>;
+ 0, 0>;
def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, Imm32, i32imm,
imm_su, imm, i32i8imm, i32immSExt8_su, i32immSExt8,
- 1, OpSize32, 0>;
+ 0, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, Imm32S, i64i32imm,
- i64immSExt32_su, i64immSExt32, i64i8imm, i64immSExt8_su,
- i64immSExt8, 1, OpSizeFixed, 1>;
+ i64immSExt32_su, i64immSExt32, i64i8imm, i64immSExt8_su,
+ i64immSExt8, 0, 1>;
// Group template arguments that can be derived from the vector type (EltNum x
// EltVT). These are things like the register class for the writemask, etc.
@@ -585,26 +572,26 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
+ : I<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE1]>;
class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE1]>;
class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
Requires<[UseSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
Requires<[UseSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, XS,
Requires<[HasAVX]>;
class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, PS,
- Requires<[HasAVX]>;
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>,
+ TB, Requires<[HasAVX]>;
// SSE2 Instruction Templates:
//
@@ -626,49 +613,49 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
+ : I<o, F, outs, ins, asm, pattern>, TB, XD, Requires<[UseSSE2]>;
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, XD, Requires<[UseSSE2]>;
class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+ : I<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE2]>;
class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD,
Requires<[UseSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD,
Requires<[UseSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD,
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, XD,
Requires<[UseAVX]>;
class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, XS,
Requires<[HasAVX]>;
class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>,
- PD, Requires<[HasAVX]>;
+ TB, PD, Requires<[HasAVX]>;
class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, !strconcat("v", asm), pattern>, PD,
+ : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, PD,
Requires<[UseAVX]>;
class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, PD, Requires<[UseSSE2]>;
+ : I<o, F, outs, ins, asm, pattern>, TB, PD, Requires<[UseSSE2]>;
class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX, HasSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, XD, Requires<[HasMMX, HasSSE2]>;
class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX, HasSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[HasMMX, HasSSE2]>;
// SSE3 Instruction Templates:
//
@@ -678,15 +665,15 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, XS,
Requires<[UseSSE3]>;
class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, XD,
Requires<[UseSSE3]>;
class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD,
Requires<[UseSSE3]>;
@@ -703,19 +690,19 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD,
Requires<[UseSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
Requires<[UseSSSE3]>;
class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PS,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
Requires<[HasMMX, HasSSSE3]>;
class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPS,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
Requires<[HasMMX, HasSSSE3]>;
// SSE4.1 Instruction Templates:
@@ -725,11 +712,11 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
//
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD,
Requires<[UseSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
Requires<[UseSSE41]>;
// SSE4.2 Instruction Templates:
@@ -737,56 +724,49 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
// SS428I - SSE 4.2 instructions with T8 prefix.
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD,
Requires<[UseSSE42]>;
// SS42AI = SSE 4.2 instructions with TA prefix
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
Requires<[UseSSE42]>;
-// CRC32I - SSE 4.2 CRC32 instructions.
-// NOTE: 'HasCRC32' is used as CRC32 instructions are GPR only and not directly
-// controlled by the SSE42 flag.
-class CRC32I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasCRC32]>;
-
// AVX Instruction Templates:
// Instructions introduced in AVX (no SSE equivalent forms)
//
-// AVX8I - AVX instructions with T8PD prefix.
-// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
+// AVX8I - AVX instructions with T8, PD prefix.
+// AVXAIi8 - AVX instructions with TA, PD prefix and ImmT = Imm8.
class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD,
Requires<[HasAVX]>;
class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
Requires<[HasAVX]>;
// AVX2 Instruction Templates:
// Instructions introduced in AVX2 (no SSE equivalent forms)
//
-// AVX28I - AVX2 instructions with T8PD prefix.
-// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
+// AVX28I - AVX2 instructions with T8, PD prefix.
+// AVX2AIi8 - AVX2 instructions with TA, PD prefix and ImmT = Imm8.
class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD,
Requires<[HasAVX2]>;
class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
Requires<[HasAVX2]>;
// AVX-512 Instruction Templates:
// Instructions introduced in AVX-512 (no SSE equivalent forms)
//
-// AVX5128I - AVX-512 instructions with T8PD prefix.
-// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8.
+// AVX5128I - AVX-512 instructions with T8, PD prefix.
+// AVX512AIi8 - AVX-512 instructions with TA, PD prefix and ImmT = Imm8.
// AVX512PDI - AVX-512 instructions with PD, double packed.
// AVX512PSI - AVX-512 instructions with PS, single packed.
// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
@@ -796,39 +776,39 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD,
Requires<[HasAVX512]>;
-class AVX5128IBase : T8PD {
+class AVX5128IBase : T8, PD {
Domain ExeDomain = SSEPackedInt;
}
class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8XS,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, XS,
Requires<[HasAVX512]>;
class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, XS,
+ : I<o, F, outs, ins, asm, pattern>, TB, XS,
Requires<[HasAVX512]>;
class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, XD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, TB, XD,
Requires<[HasAVX512]>;
class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, TB, PD,
Requires<[HasAVX512]>;
-class AVX512BIBase : PD {
+class AVX512BIBase : TB, PD {
Domain ExeDomain = SSEPackedInt;
}
class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TB, PD,
Requires<[HasAVX512]>;
class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
Requires<[HasAVX512]>;
-class AVX512AIi8Base : TAPD {
+class AVX512AIi8Base : TA, PD {
ImmType ImmT = Imm8;
}
class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
@@ -837,11 +817,11 @@ class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
Requires<[HasAVX512]>;
class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD,
Requires<[HasAVX512]>;
class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
Requires<[HasAVX512]>;
class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, Domain d>
@@ -851,8 +831,8 @@ class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
: I<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>;
class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : I<o, F, outs, ins, asm, pattern>, T8PD,
- EVEX_4V, Requires<[HasAVX512]>;
+ : I<o, F, outs, ins, asm, pattern>, T8, PD,
+ EVEX, VVVV, Requires<[HasAVX512]>;
class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
@@ -864,46 +844,46 @@ class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
// These use the same encoding as the SSE4.2 T8 and TA encodings.
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD,
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD,
Requires<[NoAVX, HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
Requires<[NoAVX, HasAES]>;
// PCLMUL Instruction Templates
class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD;
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : I<o, F, outs, ins, asm, pattern>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
+ : I<o, F, outs, ins, asm, pattern>, T8, PD,
+ VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : I<o, F, outs, ins, asm, pattern>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
+ : I<o, F, outs, ins, asm, pattern>, T8, PD,
+ VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : I<o, F, outs, ins, asm, pattern>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
+ : I<o, F, outs, ins, asm, pattern>, T8, PD,
+ VEX, VVVV, FMASC, Requires<[HasFMA, NoAVX512]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
- VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>;
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TA, PD,
+ VEX, VVVV, FMASC, Requires<[HasFMA4, NoVLX]>;
class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
- VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TA, PD,
+ VEX, VVVV, FMASC, Requires<[HasFMA4, NoAVX512]>;
class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
- VEX_4V, FMASC, Requires<[HasFMA4]>;
+ : Ii8Reg<o, F, outs, ins, asm, pattern>, TA, PD,
+ VEX, VVVV, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
@@ -925,8 +905,8 @@ class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
// XOP 5 operand instruction (VEX encoding!)
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
- : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
- VEX_4V, Requires<[HasXOP]>;
+ : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD,
+ VEX, VVVV, Requires<[HasXOP]>;
// X86-64 Instruction templates...
//
@@ -964,31 +944,396 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : I<o, F, outs, ins, asm, pattern>, PS, REX_W,
+ : I<o, F, outs, ins, asm, pattern>, TB, REX_W,
Requires<[HasMMX,In64BitMode]>;
class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
- : Ii8<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
/// ITy - This instruction base class takes the type info for the instruction.
/// Using this, it:
/// 1. Concatenates together the instruction mnemonic with the appropriate
/// suffix letter, a tab, and the arguments.
-/// 2. Infers whether the instruction should have a 0x66 prefix byte.
-/// 3. Infers whether the instruction should have a 0x40 REX_W prefix.
-/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+/// 2. Infers whether the instruction should have a 0x40 REX_W prefix.
+/// 3. Infers whether the low bit of the opcode should be 0 (for i8 operations)
/// or 1 (for i16,i32,i64 operations).
-class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
- string mnemonic, string args, list<dag> pattern>
- : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
- opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
- f, outs, ins,
- !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> {
-
- // Infer instruction prefixes from type info.
- let OpSize = typeinfo.OpSize;
- let hasREX_W = typeinfo.HasREX_W;
+class ITy<bits<8> o, Format f, X86TypeInfo t, dag outs, dag ins, string m,
+ string args, list<dag> p>
+ : I<{o{7}, o{6}, o{5}, o{4}, o{3}, o{2}, o{1},
+ !if(!eq(t.HasEvenOpcode, 1), 0, o{0})}, f, outs, ins,
+ !strconcat(m, "{", t.InstrSuffix, "}\t", args), p> {
+ let hasSideEffects = 0;
+ let hasREX_W = t.HasREX_W;
+}
+
+// BinOpRR - Instructions that read "reg, reg".
+class BinOpRR<bits<8> o, string m, string args, X86TypeInfo t, dag out, list<dag> p>
+ : ITy<o, MRMDestReg, t, out, (ins t.RegClass:$src1, t.RegClass:$src2), m,
+ args, p>, Sched<[WriteALU]>;
+// BinOpRR_F - Instructions that read "reg, reg" and write EFLAGS only.
+class BinOpRR_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
+ : BinOpRR<o, m, binop_args, t, (outs),
+ [(set EFLAGS, (node t.RegClass:$src1, t.RegClass:$src2))]>,
+ DefEFLAGS;
+// BinOpRR_F_Rev - Reversed encoding of BinOpRR_F
+class BinOpRR_F_Rev<bits<8> o, string m, X86TypeInfo t>
+ : BinOpRR_F<o, m, t, null_frag>, DisassembleOnly {
+ let Form = MRMSrcReg;
+}
+// BinOpRR_R - Instructions that read "reg, reg" and write "reg".
+class BinOpRR_R<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>
+ : BinOpRR<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t,
+ (outs t.RegClass:$dst), []>, NDD<ndd>;
+// BinOpRR_R_Rev - Reversed encoding of BinOpRR_R
+class BinOpRR_R_Rev<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>
+ : BinOpRR_R<o, m, t, ndd>, DisassembleOnly {
+ let Form = MRMSrcReg;
+}
+// BinOpRR_RF - Instructions that read "reg, reg", and write "reg", EFLAGS.
+class BinOpRR_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0>
+ : BinOpRR<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t,
+ (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS,
+ (node t.RegClass:$src1, t.RegClass:$src2))]>, DefEFLAGS, NDD<ndd>;
+// BinOpRR_RF_Rev - Reversed encoding of BinOpRR_RF.
+class BinOpRR_RF_Rev<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>
+ : BinOpRR_RF<o, m, t, null_frag, ndd>, DisassembleOnly {
+ let Form = MRMSrcReg;
+}
+// BinOpRRF_RF - Instructions that read "reg, reg", write "reg" and read/write
+// EFLAGS.
+class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0>
+ : BinOpRR<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS,
+ (node t.RegClass:$src1, t.RegClass:$src2,
+ EFLAGS))]>, DefEFLAGS, UseEFLAGS, NDD<ndd> {
+ let SchedRW = [WriteADC];
+}
+// BinOpRRF_RF_Rev - Reversed encoding of BinOpRRF_RF
+class BinOpRRF_RF_Rev<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>
+ : BinOpRRF_RF<o, m, t, null_frag, ndd>, DisassembleOnly {
+ let Form = MRMSrcReg;
+}
+
+// BinOpRM - Instructions that read "reg, [mem]".
+class BinOpRM<bits<8> o, string m, string args, X86TypeInfo t, dag out, list<dag> p>
+ : ITy<o, MRMSrcMem, t, out, (ins t.RegClass:$src1, t.MemOperand:$src2), m,
+ args, p>,
+ Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]> {
+ let mayLoad = 1;
+}
+// BinOpRM_F - Instructions that read "reg, [mem]" and write EFLAGS only.
+class BinOpRM_F<bits<8> o, string m, X86TypeInfo t, SDNode node>
+ : BinOpRM<o, m, binop_args, t, (outs),
+ [(set EFLAGS, (node t.RegClass:$src1,
+ (t.LoadNode addr:$src2)))]>, DefEFLAGS;
+// BinOpRM_R - Instructions that read "reg, [mem]", and write "reg".
+class BinOpRM_R<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>
+ : BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
+ []>, NDD<ndd>;
+// BinOpRM_RF - Instructions that read "reg, [mem]", and write "reg", EFLAGS.
+class BinOpRM_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0>
+ : BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1,
+ (t.LoadNode addr:$src2)))]>, DefEFLAGS, NDD<ndd>;
+// BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write
+// EFLAGS.
+class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0>
+ : BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS,
+ (node t.RegClass:$src1, (t.LoadNode addr:$src2), EFLAGS))]>,
+ DefEFLAGS, UseEFLAGS, NDD<ndd> {
+ let SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold,
+ // base, scale, index, offset, segment.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // implicit register read.
+ WriteADC.ReadAfterFold];
+}
+
+// BinOpRI - Instructions that read "reg, imm".
+class BinOpRI<bits<8> o, string m, string args, X86TypeInfo t, Format f, dag out, list<dag> p>
+ : ITy<o, f, t, out, (ins t.RegClass:$src1, t.ImmOperand:$src2), m,
+ args, p>, Sched<[WriteALU]> {
+ let ImmT = t.ImmEncoding;
+}
+// BinOpRI_F - Instructions that read "reg, imm" and write EFLAGS only.
+class BinOpRI_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node,
+ Format f>
+ : BinOpRI<o, m, binop_args, t, f, (outs),
+ [(set EFLAGS, (node t.RegClass:$src1,
+ t.ImmOperator:$src2))]>, DefEFLAGS;
+// BinOpRI_R - Instructions that read "reg, imm" and write "reg".
+class BinOpRI_R<bits<8> o, string m, X86TypeInfo t, Format f, bit ndd = 0>
+ : BinOpRI<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, f, (outs t.RegClass:$dst),
+ []>, NDD<ndd>;
+// BinOpRI_RF - Instructions that read "reg, imm" and write "reg", EFLAGS.
+class BinOpRI_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, Format f, bit ndd = 0>
+ : BinOpRI<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, f, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS,
+ (node t.RegClass:$src1, t.ImmOperator:$src2))]>, DefEFLAGS, NDD<ndd>;
+// BinOpRIF_RF - Instructions that read "reg, imm", write "reg" and read/write
+// EFLAGS.
+class BinOpRIF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f, bit ndd = 0>
+ : BinOpRI<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, f, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS,
+ (node t.RegClass:$src1, t.ImmOperator:$src2,
+ EFLAGS))]>, DefEFLAGS, UseEFLAGS, NDD<ndd> {
+ let SchedRW = [WriteADC];
+}
+// BinOpRI8 - Instructions that read "reg, imm8".
+class BinOpRI8<bits<8> o, string m, string args, X86TypeInfo t, Format f, dag out>
+ : ITy<o, f, t, out, (ins t.RegClass:$src1, t.Imm8Operand:$src2), m,
+ args, []>, Sched<[WriteALU]> {
+ let ImmT = Imm8;
+}
+// BinOpRI8_F - Instructions that read "reg, imm8" and write EFLAGS only.
+class BinOpRI8_F<bits<8> o, string m, X86TypeInfo t, Format f>
+ : BinOpRI8<o, m, binop_args, t, f, (outs)>, DefEFLAGS;
+// BinOpRI8_R - Instructions that read "reg, imm8" and write "reg".
+class BinOpRI8_R<bits<8> o, string m, X86TypeInfo t, Format f, bit ndd = 0>
+ : BinOpRI8<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, f, (outs t.RegClass:$dst)>, NDD<ndd>;
+// BinOpRI8_RF - Instructions that read "reg, imm8" and write "reg", EFLAGS.
+class BinOpRI8_RF<bits<8> o, string m, X86TypeInfo t, Format f, bit ndd = 0>
+ : BinOpRI8<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, f, (outs t.RegClass:$dst)>, DefEFLAGS, NDD<ndd>;
+// BinOpRI8F_RF - Instructions that read "reg, imm8", write "reg" and read/write
+// EFLAGS.
+class BinOpRI8F_RF<bits<8> o, string m, X86TypeInfo t, Format f, bit ndd = 0>
+ : BinOpRI8<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, f, (outs t.RegClass:$dst)>, DefEFLAGS, UseEFLAGS, NDD<ndd> {
+ let SchedRW = [WriteADC];
+}
+
+// BinOpMR - Instructions that read "[mem], reg".
+class BinOpMR<bits<8> o, string m, string args, X86TypeInfo t, dag out, list<dag> p>
+ : ITy<o, MRMDestMem, t, out, (ins t.MemOperand:$src1, t.RegClass:$src2), m,
+ args, p> {
+ let mayLoad = 1;
+ let SchedRW = [WriteALU.Folded, WriteALU.ReadAfterFold];
+}
+// BinOpMR_R - Instructions that read "[mem], reg", and write "reg".
+class BinOpMR_R<bits<8> o, string m, X86TypeInfo t>
+ : BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst), []>, NDD<1>;
+// BinOpMR_RF - Instructions that read "[mem], reg", and write "reg", EFLAGS.
+class BinOpMR_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
+ : BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1),
+ t.RegClass:$src2))]>, DefEFLAGS, NDD<1>;
+// BinOpMR_F - Instructions that read "[mem], reg" and write EFLAGS only.
+class BinOpMR_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
+ : BinOpMR<o, m, binop_args, t, (outs),
+ [(set EFLAGS, (node (t.LoadNode addr:$src1), t.RegClass:$src2))]>,
+ Sched<[WriteALU.Folded, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault, WriteALU.ReadAfterFold]>, DefEFLAGS;
+// BinOpMR_M - Instructions that read "[mem], reg" and write "[mem]".
+class BinOpMR_M<bits<8> o, string m, X86TypeInfo t>
+ : BinOpMR<o, m, binop_args, t, (outs), []>,
+ Sched<[WriteALURMW,
+ // base, scale, index, offset, segment
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault]> {
+ let mayStore = 1;
+}
+// BinOpMR_MF - Instructions that read "[mem], reg" and write "[mem]", EFLAGS.
+class BinOpMR_MF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
+ : BinOpMR<o, m, binop_args, t, (outs),
+ [(store (node (load addr:$src1), t.RegClass:$src2), addr:$src1),
+ (implicit EFLAGS)]>,
+ Sched<[WriteALURMW,
+ // base, scale, index, offset, segment
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ WriteALU.ReadAfterFold]>, // reg
+ DefEFLAGS {
+ let mayStore = 1;
+}
+// BinOpMRF_RF - Instructions that read "[mem], reg", write "reg" and
+// read/write EFLAGS.
+class BinOpMRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
+ : BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (node (load addr:$src1),
+ t.RegClass:$src2, EFLAGS))]>, DefEFLAGS, UseEFLAGS, NDD<1>,
+ Sched<[WriteADC.Folded, WriteADC.ReadAfterFold]>;
+// BinOpMRF_MF - Instructions that read "[mem], reg", write "[mem]" and
+// read/write EFLAGS.
+class BinOpMRF_MF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
+ : BinOpMR<o, m, binop_args, t, (outs),
+ [(store (node (load addr:$src1), t.RegClass:$src2, EFLAGS),
+ addr:$src1), (implicit EFLAGS)]>,
+ Sched<[WriteADCRMW,
+ // base, scale, index, offset, segment
+ ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault, ReadDefault,
+ WriteALU.ReadAfterFold, // reg
+ WriteALU.ReadAfterFold]>, // EFLAGS
+ DefEFLAGS, UseEFLAGS {
+ let mayStore = 1;
+}
+
+// BinOpMI - Instructions that read "[mem], imm".
+class BinOpMI<bits<8> o, string m, string args, X86TypeInfo t, Format f, dag out, list<dag> p>
+ : ITy<o, f, t, out, (ins t.MemOperand:$src1, t.ImmOperand:$src2), m,
+ args, p> {
+ let ImmT = t.ImmEncoding;
+ let mayLoad = 1;
+}
+// BinOpMI_F - Instructions that read "[mem], imm" and write EFLAGS only.
+class BinOpMI_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node,
+ Format f>
+ : BinOpMI<o, m, binop_args, t, f, (outs),
+ [(set EFLAGS, (node (t.LoadNode addr:$src1), t.ImmOperator:$src2))]>,
+ Sched<[WriteALU.Folded]>, DefEFLAGS;
+// BinOpMI_R - Instructions that read "[mem], imm" and write "reg".
+class BinOpMI_R<bits<8> o, string m, X86TypeInfo t, Format f>
+ : BinOpMI<o, m, binop_ndd_args, t, f, (outs t.RegClass:$dst), []>,
+ Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, NDD<1>;
+// BinOpMI_RF - Instructions that read "[mem], imm" and write "reg", EFLAGS.
+class BinOpMI_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node,
+ Format f>
+ : BinOpMI<o, m, binop_ndd_args, t, f, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1), t.ImmOperator:$src2))]>,
+ Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
+// BinOpMI_M - Instructions that read "[mem], imm" and write "[mem]".
+class BinOpMI_M<bits<8> o, string m, X86TypeInfo t, Format f>
+ : BinOpMI<o, m, binop_args, t, f, (outs), []>, Sched<[WriteALURMW]> {
+ let mayStore = 1;
+}
+// BinOpMI_MF - Instructions that read "[mem], imm" and write "[mem]", EFLAGS.
+class BinOpMI_MF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, Format f>
+ : BinOpMI<o, m, binop_args, t, f, (outs),
+ [(store (node (t.VT (load addr:$src1)),
+ t.ImmOperator:$src2), addr:$src1), (implicit EFLAGS)]>,
+ Sched<[WriteALURMW]>, DefEFLAGS {
+ let mayStore = 1;
+}
+// BinOpMIF_RF - Instructions that read "[mem], imm", write "reg" and
+// read/write EFLAGS.
+class BinOpMIF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f>
+ : BinOpMI<o, m, binop_ndd_args, t, f, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (node (t.VT (load addr:$src1)),
+ t.ImmOperator:$src2, EFLAGS))]>,
+ Sched<[WriteADC.Folded, WriteADC.ReadAfterFold]>, DefEFLAGS, UseEFLAGS, NDD<1>;
+// BinOpMIF_MF - Instructions that read "[mem], imm", write "[mem]" and
+// read/write EFLAGS.
+class BinOpMIF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f>
+ : BinOpMI<o, m, binop_args, t, f, (outs),
+ [(store (node (t.VT (load addr:$src1)),
+ t.ImmOperator:$src2, EFLAGS), addr:$src1), (implicit EFLAGS)]>,
+ Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS {
+ let mayStore = 1;
+}
+
+// BinOpMI8 - Instructions that read "[mem], imm8".
+class BinOpMI8<string m, string args, X86TypeInfo t, Format f, dag out>
+ : ITy<0x83, f, t, out, (ins t.MemOperand:$src1, t.Imm8Operand:$src2), m,
+ args, []> {
+ let ImmT = Imm8;
+ let mayLoad = 1;
+}
+// BinOpMI8_F - Instructions that read "[mem], imm8" and write EFLAGS only.
+class BinOpMI8_F<string m, X86TypeInfo t, Format f>
+ : BinOpMI8<m, binop_args, t, f, (outs)>, Sched<[WriteALU.Folded]>, DefEFLAGS;
+// BinOpMI8_R - Instructions that read "[mem], imm8" and write "reg".
+class BinOpMI8_R<string m, X86TypeInfo t, Format f>
+ : BinOpMI8<m, binop_ndd_args, t, f, (outs t.RegClass:$dst)>, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, NDD<1>;
+// BinOpMI8_RF - Instructions that read "[mem], imm8" and write "reg"/EFLAGS.
+class BinOpMI8_RF<string m, X86TypeInfo t, Format f>
+ : BinOpMI8<m, binop_ndd_args, t, f, (outs t.RegClass:$dst)>, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
+// BinOpMI8_M - Instructions that read "[mem], imm8" and write "[mem]".
+class BinOpMI8_M<string m, X86TypeInfo t, Format f>
+ : BinOpMI8<m, binop_args, t, f, (outs)>, Sched<[WriteALURMW]> {
+ let mayStore = 1;
+}
+// BinOpMI8_MF - Instructions that read "[mem], imm8" and write "[mem]", EFLAGS.
+class BinOpMI8_MF<string m, X86TypeInfo t, Format f>
+ : BinOpMI8<m, binop_args, t, f, (outs)>, Sched<[WriteALURMW]>, DefEFLAGS {
+ let mayStore = 1;
+}
+// BinOpMI8F_RF - Instructions that read "[mem], imm8", write "reg" and
+// read/write EFLAGS.
+class BinOpMI8F_RF<string m, X86TypeInfo t, Format f>
+ : BinOpMI8<m, binop_ndd_args, t, f, (outs t.RegClass:$dst)>,
+ Sched<[WriteADC.Folded, WriteADC.ReadAfterFold]>, DefEFLAGS, UseEFLAGS, NDD<1>;
+// BinOpMI8F_MF - Instructions that read "[mem], imm8", write "[mem]" and
+// read/write EFLAGS.
+class BinOpMI8F_MF<string m, X86TypeInfo t, Format f>
+ : BinOpMI8<m, binop_args, t, f, (outs)>, Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS {
+ let mayStore = 1;
+}
+
+// BinOpAI - Instructions that read "a-reg imm" (Accumulator register).
+class BinOpAI<bits<8> o, string m, X86TypeInfo t, Register areg, string args>
+ : ITy<o, RawFrm, t, (outs), (ins t.ImmOperand:$src), m, args, []>,
+ Sched<[WriteALU]> {
+ let ImmT = t.ImmEncoding;
+ let Uses = [areg];
+}
+// BinOpAI_F - Instructions that read "a-reg imm" and write EFLAGS only.
+class BinOpAI_F<bits<8> o, string m, X86TypeInfo t, Register areg, string args>
+ : BinOpAI<o, m, t, areg, args>, DefEFLAGS;
+
+// BinOpAI_AF - Instructions that read "a-reg imm" and write a-reg/EFLAGS.
+class BinOpAI_AF<bits<8> o, string m, X86TypeInfo t, Register areg,
+ string args> : BinOpAI<o, m, t, areg, args> {
+ let Defs = [areg, EFLAGS];
+}
+// BinOpAIF_AF - Instructions that read "a-reg imm", write a-reg and read/write
+// EFLAGS.
+class BinOpAIF_AF<bits<8> o, string m, X86TypeInfo t, Register areg,
+ string args> : BinOpAI<o, m, t, areg, args> {
+ let Uses = [areg, EFLAGS];
+ let Defs = [areg, EFLAGS];
+ let SchedRW = [WriteADC];
+}
+
+// UnaryOpR - Instructions that read "reg".
+class UnaryOpR<bits<8> o, Format f, string m, string args, X86TypeInfo t,
+ dag out, list<dag> p>
+ : ITy<o, f, t, out, (ins t.RegClass:$src1), m, args, p>, Sched<[WriteALU]>;
+// UnaryOpR_R - Instructions that read "reg" and write "reg".
+class UnaryOpR_R<bits<8> o, Format f, string m, X86TypeInfo t,
+ SDPatternOperator node, bit ndd = 0>
+ : UnaryOpR<o, f, m, !if(!eq(ndd, 0), unaryop_args, unaryop_ndd_args), t,
+ (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, (node t.RegClass:$src1))]>, NDD<ndd>;
+// UnaryOpR_RF - Instructions that read "reg" and write "reg"/EFLAGS.
+class UnaryOpR_RF<bits<8> o, Format f, string m, X86TypeInfo t,
+ SDPatternOperator node, bit ndd = 0>
+ : UnaryOpR<o, f, m, !if(!eq(ndd, 0), unaryop_args, unaryop_ndd_args), t,
+ (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, (node t.RegClass:$src1)),
+ (implicit EFLAGS)]>, DefEFLAGS, NDD<ndd>;
+
+// UnaryOpM - Instructions that read "[mem]".
+class UnaryOpM<bits<8> o, Format f, string m, string args, X86TypeInfo t,
+ dag out, list<dag> p>
+ : ITy<o, f, t, out, (ins t.MemOperand:$src1), m, args, p> {
+ let mayLoad = 1;
+}
+// UnaryOpM_R - Instructions that read "[mem]" and write "reg".
+class UnaryOpM_R<bits<8> o, Format f, string m, X86TypeInfo t,
+ SDPatternOperator node>
+ : UnaryOpM<o, f, m, unaryop_ndd_args, t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1)))]>,
+ Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, NDD<1>;
+// UnaryOpM_RF - Instructions that read "[mem]" and write "reg"/EFLAGS.
+class UnaryOpM_RF<bits<8> o, Format f, string m, X86TypeInfo t,
+ SDPatternOperator node>
+ : UnaryOpM<o, f, m, unaryop_ndd_args, t, (outs t.RegClass:$dst),
+ [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1)))]>,
+ Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
+// UnaryOpM_M - Instructions that read "[mem]" and write "[mem]".
+class UnaryOpM_M<bits<8> o, Format f, string m, X86TypeInfo t,
+ SDPatternOperator node>
+ : UnaryOpM<o, f, m, unaryop_args, t, (outs),
+ [(store (node (t.LoadNode addr:$src1)), addr:$src1)]>,
+ Sched<[WriteALURMW]>{
+ let mayStore = 1;
+}
+// UnaryOpM_MF - Instructions that read "[mem]" and write "[mem]"/EFLAGS.
+class UnaryOpM_MF<bits<8> o, Format f, string m, X86TypeInfo t,
+ SDPatternOperator node>
+ : UnaryOpM<o, f, m, unaryop_args, t, (outs),
+ [(store (node (t.LoadNode addr:$src1)), addr:$src1),
+ (implicit EFLAGS)]>, Sched<[WriteALURMW]>, DefEFLAGS {
+ let mayStore = 1;
}
diff --git a/llvm/lib/Target/X86/X86InstrVMX.td b/llvm/lib/Target/X86/X86InstrVMX.td
index 5289819119ce..7cc468fe15ad 100644
--- a/llvm/lib/Target/X86/X86InstrVMX.td
+++ b/llvm/lib/Target/X86/X86InstrVMX.td
@@ -17,33 +17,33 @@
let SchedRW = [WriteSystem] in {
// 66 0F 38 80
def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
Requires<[Not64BitMode]>;
def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
Requires<[In64BitMode]>;
def INVEPT64_EVEX : I<0xF0, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
"invept\t{$src2, $src1|$src1, $src2}", []>,
- EVEX_NoCD8, T_MAP4XS, Requires<[In64BitMode]>;
+ EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>;
// 66 0F 38 81
def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
Requires<[Not64BitMode]>;
def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8, PD,
Requires<[In64BitMode]>;
def INVVPID64_EVEX : I<0xF1, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
"invvpid\t{$src2, $src1|$src1, $src2}", []>,
- EVEX_NoCD8, T_MAP4XS, Requires<[In64BitMode]>;
+ EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>;
// 0F 01 C1
def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
- "vmclear\t$vmcs", []>, PD;
+ "vmclear\t$vmcs", []>, TB, PD;
// 0F 01 D4
-def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, PS;
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
// 0F 01 C2
def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
@@ -51,35 +51,35 @@ def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
// 0F 01 C3
def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
- "vmptrld\t$vmcs", []>, PS;
+ "vmptrld\t$vmcs", []>, TB;
def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
- "vmptrst\t$vmcs", []>, PS;
+ "vmptrst\t$vmcs", []>, TB;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>;
let mayStore = 1 in {
def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>;
} // mayStore
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>;
let mayLoad = 1 in {
def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>;
} // mayLoad
// 0F 01 C4
def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
- "vmxon\t$vmxon", []>, XS;
+ "vmxon\t$vmxon", []>, TB, XS;
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index 70bd77bba03a..bbd19cf8d5b2 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -130,6 +130,9 @@ let Predicates = [HasAVX, NoVLX] in {
defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, sub_xmm>;
}
+let Predicates = [HasAVXNECONVERT, NoVLX] in
+ defm : subvec_zero_lowering<"DQA", VR128, v16bf16, v8bf16, sub_xmm>;
+
let Predicates = [HasVLX] in {
defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, sub_xmm>;
defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, sub_xmm>;
@@ -175,6 +178,12 @@ let Predicates = [HasFP16, HasVLX] in {
defm : subvec_zero_lowering<"APSZ256", VR256X, v32f16, v16f16, sub_ymm>;
}
+let Predicates = [HasBF16, HasVLX] in {
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v16bf16, v8bf16, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ128", VR128X, v32bf16, v8bf16, sub_xmm>;
+ defm : subvec_zero_lowering<"APSZ256", VR256X, v32bf16, v16bf16, sub_ymm>;
+}
+
class maskzeroupper<ValueType vt, RegisterClass RC> :
PatLeaf<(vt RC:$src), [{
return isMaskZeroExtended(N);
diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td
index a62bb2e855c9..1504d77bfb86 100644
--- a/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/llvm/lib/Target/X86/X86InstrXOP.td
@@ -105,7 +105,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)))))]>,
- XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -119,7 +119,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>,
- XOP_4V, REX_W, Sched<[sched]>;
+ XOP, VVVV, REX_W, Sched<[sched]>;
}
let ExeDomain = SSEPackedInt in {
@@ -173,7 +173,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP, VVVV,
Sched<[sched]>;
def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
@@ -181,7 +181,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (load addr:$src2),
- VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ VR128:$src3))]>, XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
let ExeDomain = SSEPackedInt in {
@@ -252,7 +252,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
timm:$cc)))]>,
- XOP_4V, Sched<[sched]>;
+ XOP, VVVV, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$cc),
!strconcat("vpcom", Suffix,
@@ -261,7 +261,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)),
timm:$cc)))]>,
- XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def : Pat<(OpNode (load addr:$src2),
@@ -288,7 +288,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[sched]>;
+ XOP, VVVV, Sched<[sched]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
@@ -296,7 +296,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 (load addr:$src3)))))]>,
- XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -304,7 +304,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
// 128mem:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
@@ -316,7 +316,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, REX_W, Sched<[sched]>;
+ []>, XOP, VVVV, REX_W, Sched<[sched]>;
}
let ExeDomain = SSEPackedInt in {
@@ -333,7 +333,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
- (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
+ (X86andnp RC:$src3, RC:$src2))))]>, XOP, VVVV,
Sched<[sched]>;
// FIXME: We can't write a pattern for this in tablegen.
let hasSideEffects = 0, mayLoad = 1 in
@@ -342,14 +342,14 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>,
- XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, (load addr:$src2)))))]>,
- XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
// x86memop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
@@ -361,7 +361,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, REX_W, Sched<[sched]>;
+ []>, XOP, VVVV, REX_W, Sched<[sched]>;
}
let ExeDomain = SSEPackedInt in {
diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp
index 82667b8cdbdb..c0fa9aa70324 100644
--- a/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -68,7 +68,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
namespace llvm {
std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation() {
- return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent);
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent,
+ /*BranchOnly=*/true);
}
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8a04987e768a..49631f38017a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1459,6 +1459,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
Args, CxtI);
}
+InstructionCost
+X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1, const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const {
+ if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
+ return TTI::TCC_Basic;
+ return InstructionCost::getInvalid();
+}
+
InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask,
@@ -3724,10 +3733,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
{ ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
{ ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
- { ISD::BSWAP, MVT::v4i64, { 5, 7, 5, 10 } },
- { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 3 } },
- { ISD::BSWAP, MVT::v8i32, { 5, 7, 5, 10 } },
- { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 3 } },
+ { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
+ { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
+ { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
+ { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
{ ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
{ ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
{ ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
@@ -3804,6 +3813,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
};
static const CostKindTblEntry SLMCostTbl[] = {
+ { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
+ { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
+ { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
{ ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
{ ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
{ ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
@@ -3842,9 +3854,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
{ ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
{ ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
- { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
- { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
- { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
+ { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
+ { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
+ { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
{ ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
{ ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
{ ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0fa0d240a548..07a3fff4f84b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -140,6 +140,11 @@ public:
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
+ InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const;
+
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
diff --git a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 1288597fc6b0..05003ec304ad 100644
--- a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -250,7 +250,7 @@ bool XCoreDAGToDAGISel::tryBRIND(SDNode *N) {
SDValue Addr = N->getOperand(1);
if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
return false;
- unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue();
+ unsigned IntNo = Addr->getConstantOperandVal(1);
if (IntNo != Intrinsic::xcore_checkevent)
return false;
SDValue nextAddr = Addr->getOperand(2);
diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index 7736adab19e8..18feeaadb03c 100644
--- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -767,7 +767,7 @@ SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
// An index of zero corresponds to the current function's frame address.
// An index of one to the parent's frame address, and so on.
// Depths > 0 not supported yet!
- if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+ if (Op.getConstantOperandVal(0) > 0)
return SDValue();
MachineFunction &MF = DAG.getMachineFunction();
@@ -783,7 +783,7 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
// An index of zero corresponds to the current function's return address.
// An index of one to the parent's return address, and so on.
// Depths > 0 not supported yet!
- if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+ if (Op.getConstantOperandVal(0) > 0)
return SDValue();
MachineFunction &MF = DAG.getMachineFunction();
@@ -905,7 +905,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
SDValue XCoreTargetLowering::
LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
switch (IntNo) {
case Intrinsic::xcore_crc8:
EVT VT = Op.getValueType();
@@ -1497,7 +1497,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case ISD::INTRINSIC_VOID:
- switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ switch (N->getConstantOperandVal(1)) {
case Intrinsic::xcore_outt:
case Intrinsic::xcore_outct:
case Intrinsic::xcore_chkct: {
@@ -1733,30 +1733,30 @@ void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
case ISD::INTRINSIC_W_CHAIN:
{
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- switch (IntNo) {
- case Intrinsic::xcore_getts:
- // High bits are known to be zero.
- Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
- Known.getBitWidth() - 16);
- break;
- case Intrinsic::xcore_int:
- case Intrinsic::xcore_inct:
- // High bits are known to be zero.
- Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
- Known.getBitWidth() - 8);
- break;
- case Intrinsic::xcore_testct:
- // Result is either 0 or 1.
- Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
- Known.getBitWidth() - 1);
- break;
- case Intrinsic::xcore_testwct:
- // Result is in the range 0 - 4.
- Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
- Known.getBitWidth() - 3);
- break;
- }
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ switch (IntNo) {
+ case Intrinsic::xcore_getts:
+ // High bits are known to be zero.
+ Known.Zero =
+ APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 16);
+ break;
+ case Intrinsic::xcore_int:
+ case Intrinsic::xcore_inct:
+ // High bits are known to be zero.
+ Known.Zero =
+ APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 8);
+ break;
+ case Intrinsic::xcore_testct:
+ // Result is either 0 or 1.
+ Known.Zero =
+ APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 1);
+ break;
+ case Intrinsic::xcore_testwct:
+ // Result is in the range 0 - 4.
+ Known.Zero =
+ APInt::getHighBitsSet(Known.getBitWidth(), Known.getBitWidth() - 3);
+ break;
+ }
}
break;
}
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index 27d168020ce6..ce640f5b8d45 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -348,11 +348,7 @@ StringRef ARM::getArchExtName(uint64_t ArchExtKind) {
}
static bool stripNegationPrefix(StringRef &Name) {
- if (Name.starts_with("no")) {
- Name = Name.substr(2);
- return true;
- }
- return false;
+ return Name.consume_front("no");
}
StringRef ARM::getArchExtFeature(StringRef ArchExt) {
diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
index 10b80cad4347..6d3a59d532fd 100644
--- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp
@@ -134,13 +134,13 @@ ARM::EndianKind ARM::parseArchEndian(StringRef Arch) {
}
// Parse a branch protection specification, which has the form
-// standard | none | [bti,pac-ret[+b-key,+leaf]*]
+// standard | none | [bti,pac-ret[+b-key,+leaf,+pc]*]
// Returns true on success, with individual elements of the specification
// returned in `PBP`. Returns false in error, with `Err` containing
// an erroneous part of the spec.
bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
StringRef &Err) {
- PBP = {"none", "a_key", false};
+ PBP = {"none", "a_key", false, false};
if (Spec == "none")
return true; // defaults are ok
@@ -166,6 +166,8 @@ bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
PBP.Scope = "all";
else if (PACOpt == "b-key")
PBP.Key = "b_key";
+ else if (PACOpt == "pc")
+ PBP.BranchProtectionPAuthLR = true;
else
break;
}
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index e61fcb248fae..2e08c7b12d9d 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1131,37 +1131,59 @@ getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
case 23:
CPU = "znver1";
*Type = X86::AMDFAM17H;
- if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) {
+ if ((Model >= 0x30 && Model <= 0x3f) || (Model == 0x47) ||
+ (Model >= 0x60 && Model <= 0x67) || (Model >= 0x68 && Model <= 0x6f) ||
+ (Model >= 0x70 && Model <= 0x7f) || (Model >= 0x84 && Model <= 0x87) ||
+ (Model >= 0x90 && Model <= 0x97) || (Model >= 0x98 && Model <= 0x9f) ||
+ (Model >= 0xa0 && Model <= 0xaf)) {
+ // Family 17h Models 30h-3Fh (Starship) Zen 2
+ // Family 17h Models 47h (Cardinal) Zen 2
+ // Family 17h Models 60h-67h (Renoir) Zen 2
+ // Family 17h Models 68h-6Fh (Lucienne) Zen 2
+ // Family 17h Models 70h-7Fh (Matisse) Zen 2
+ // Family 17h Models 84h-87h (ProjectX) Zen 2
+ // Family 17h Models 90h-97h (VanGogh) Zen 2
+ // Family 17h Models 98h-9Fh (Mero) Zen 2
+ // Family 17h Models A0h-AFh (Mendocino) Zen 2
CPU = "znver2";
*Subtype = X86::AMDFAM17H_ZNVER2;
- break; // 30h-3fh, 71h: Zen2
+ break;
}
- if (Model <= 0x0f) {
+ if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x20 && Model <= 0x2f)) {
+ // Family 17h Models 10h-1Fh (Raven1) Zen
+ // Family 17h Models 10h-1Fh (Picasso) Zen+
+ // Family 17h Models 20h-2Fh (Raven2 x86) Zen
*Subtype = X86::AMDFAM17H_ZNVER1;
- break; // 00h-0Fh: Zen1
+ break;
}
break;
case 25:
CPU = "znver3";
*Type = X86::AMDFAM19H;
- if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x5f)) {
- // Family 19h Models 00h-0Fh - Zen3
- // Family 19h Models 20h-2Fh - Zen3
- // Family 19h Models 30h-3Fh - Zen3
- // Family 19h Models 40h-4Fh - Zen3+
- // Family 19h Models 50h-5Fh - Zen3+
+ if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x2f) ||
+ (Model >= 0x30 && Model <= 0x3f) || (Model >= 0x40 && Model <= 0x4f) ||
+ (Model >= 0x50 && Model <= 0x5f)) {
+ // Family 19h Models 00h-0Fh (Genesis, Chagall) Zen 3
+ // Family 19h Models 20h-2Fh (Vermeer) Zen 3
+ // Family 19h Models 30h-3Fh (Badami) Zen 3
+ // Family 19h Models 40h-4Fh (Rembrandt) Zen 3+
+ // Family 19h Models 50h-5Fh (Cezanne) Zen 3
*Subtype = X86::AMDFAM19H_ZNVER3;
break;
}
- if ((Model >= 0x10 && Model <= 0x1f) ||
- (Model >= 0x60 && Model <= 0x74) ||
- (Model >= 0x78 && Model <= 0x7b) ||
- (Model >= 0xA0 && Model <= 0xAf)) {
+ if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x6f) ||
+ (Model >= 0x70 && Model <= 0x77) || (Model >= 0x78 && Model <= 0x7f) ||
+ (Model >= 0xa0 && Model <= 0xaf)) {
+ // Family 19h Models 10h-1Fh (Stones; Storm Peak) Zen 4
+ // Family 19h Models 60h-6Fh (Raphael) Zen 4
+ // Family 19h Models 70h-77h (Phoenix, Hawkpoint1) Zen 4
+ // Family 19h Models 78h-7Fh (Phoenix 2, Hawkpoint2) Zen 4
+ // Family 19h Models A0h-AFh (Stones-Dense) Zen 4
CPU = "znver4";
*Subtype = X86::AMDFAM19H_ZNVER4;
break; // "znver4"
}
- break; // family 19h
+ break;
default:
break; // Unknown AMD CPU.
}
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index d475650c2d18..e93502187b54 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1208,8 +1208,7 @@ static VersionTuple parseVersionFromName(StringRef Name) {
VersionTuple Triple::getEnvironmentVersion() const {
StringRef EnvironmentName = getEnvironmentName();
StringRef EnvironmentTypeName = getEnvironmentTypeName(getEnvironment());
- if (EnvironmentName.starts_with(EnvironmentTypeName))
- EnvironmentName = EnvironmentName.substr(EnvironmentTypeName.size());
+ EnvironmentName.consume_front(EnvironmentTypeName);
return parseVersionFromName(EnvironmentName);
}
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 085554f18b2b..d46ff07ec734 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -162,8 +162,6 @@ constexpr FeatureBitset FeaturesAlderlake =
constexpr FeatureBitset FeaturesSierraforest =
FeaturesAlderlake | FeatureCMPCCXADD | FeatureAVXIFMA | FeatureUINTR |
FeatureENQCMD | FeatureAVXNECONVERT | FeatureAVXVNNIINT8;
-constexpr FeatureBitset FeaturesGrandridge =
- FeaturesSierraforest | FeatureRAOINT;
constexpr FeatureBitset FeaturesArrowlakeS = FeaturesSierraforest |
FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4;
constexpr FeatureBitset FeaturesPantherlake =
@@ -369,7 +367,7 @@ constexpr ProcInfo Processors[] = {
// Sierraforest microarchitecture based processors.
{ {"sierraforest"}, CK_Sierraforest, FEATURE_AVX2, FeaturesSierraforest, 'p', false },
// Grandridge microarchitecture based processors.
- { {"grandridge"}, CK_Grandridge, FEATURE_AVX2, FeaturesGrandridge, 'p', false },
+ { {"grandridge"}, CK_Grandridge, FEATURE_AVX2, FeaturesSierraforest, 'p', false },
// Granite Rapids microarchitecture based processors.
{ {"graniterapids"}, CK_Graniterapids, FEATURE_AVX512BF16, FeaturesGraniteRapids, 'n', false },
// Granite Rapids D microarchitecture based processors.
diff --git a/llvm/lib/TextAPI/CMakeLists.txt b/llvm/lib/TextAPI/CMakeLists.txt
index 2017a1ad6398..29533756e41a 100644
--- a/llvm/lib/TextAPI/CMakeLists.txt
+++ b/llvm/lib/TextAPI/CMakeLists.txt
@@ -13,6 +13,7 @@ add_llvm_component_library(LLVMTextAPI
TextAPIError.cpp
TextStub.cpp
TextStubCommon.cpp
+ Utils.cpp
ADDITIONAL_HEADER_DIRS
"${LLVM_MAIN_INCLUDE_DIR}/llvm/TextAPI"
diff --git a/llvm/lib/TextAPI/TextStubV5.cpp b/llvm/lib/TextAPI/TextStubV5.cpp
index 2f82bc03480b..aea772dbc4be 100644
--- a/llvm/lib/TextAPI/TextStubV5.cpp
+++ b/llvm/lib/TextAPI/TextStubV5.cpp
@@ -201,8 +201,9 @@ Expected<StubT> getRequiredValue(
template <typename JsonT, typename StubT = JsonT>
Expected<StubT> getRequiredValue(
TBDKey Key, const Object *Obj,
- std::function<std::optional<JsonT>(const Object *, StringRef)> GetValue,
- StubT DefaultValue, std::function<std::optional<StubT>(JsonT)> Validate) {
+ std::function<std::optional<JsonT>(const Object *, StringRef)> const
+ GetValue,
+ StubT DefaultValue, function_ref<std::optional<StubT>(JsonT)> Validate) {
std::optional<JsonT> Val = GetValue(Obj, Keys[Key]);
if (!Val)
return DefaultValue;
@@ -215,7 +216,7 @@ Expected<StubT> getRequiredValue(
}
Error collectFromArray(TBDKey Key, const Object *Obj,
- std::function<void(StringRef)> Append,
+ function_ref<void(StringRef)> Append,
bool IsRequired = false) {
const auto *Values = Obj->getArray(Keys[Key]);
if (!Values) {
diff --git a/llvm/lib/TextAPI/Utils.cpp b/llvm/lib/TextAPI/Utils.cpp
new file mode 100644
index 000000000000..6d85083e0b54
--- /dev/null
+++ b/llvm/lib/TextAPI/Utils.cpp
@@ -0,0 +1,40 @@
+//===- Utils.cpp ----------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements utility functions for TextAPI Darwin operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TextAPI/Utils.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+
+void llvm::MachO::replace_extension(SmallVectorImpl<char> &Path,
+ const Twine &Extension) {
+ StringRef P(Path.begin(), Path.size());
+ auto ParentPath = sys::path::parent_path(P);
+ auto Filename = sys::path::filename(P);
+
+ if (!ParentPath.ends_with(Filename.str() + ".framework")) {
+ sys::path::replace_extension(Path, Extension);
+ return;
+ }
+ // Framework dylibs do not have a file extension, in those cases the new
+ // extension is appended. e.g. given Path: "Foo.framework/Foo" and Extension:
+ // "tbd", the result is "Foo.framework/Foo.tbd".
+ SmallString<8> Storage;
+ StringRef Ext = Extension.toStringRef(Storage);
+
+ // Append '.' if needed.
+ if (!Ext.empty() && Ext[0] != '.')
+ Path.push_back('.');
+
+ // Append extension.
+ Path.append(Ext.begin(), Ext.end());
+}
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index f37b4dc938d3..529f7309a1a2 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -2951,9 +2951,11 @@ void coro::salvageDebugInfo(
// dbg.declare does.
if (isa<DbgDeclareInst>(DVI)) {
std::optional<BasicBlock::iterator> InsertPt;
- if (auto *I = dyn_cast<Instruction>(Storage))
+ if (auto *I = dyn_cast<Instruction>(Storage)) {
InsertPt = I->getInsertionPointAfterDef();
- else if (isa<Argument>(Storage))
+ if (!OptimizeFrame && I->getDebugLoc())
+ DVI.setDebugLoc(I->getDebugLoc());
+ } else if (isa<Argument>(Storage))
InsertPt = F->getEntryBlock().begin();
if (InsertPt)
DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt);
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 8e1f782f7cd8..b2618e35b085 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -291,42 +291,15 @@ static const Value *getPointerOperand(const Instruction *I,
}
/// Helper function to create a pointer based on \p Ptr, and advanced by \p
-/// Offset bytes. To aid later analysis the method tries to build
-/// getelement pointer instructions that traverse the natural type of \p Ptr if
-/// possible. If that fails, the remaining offset is adjusted byte-wise, hence
-/// through a cast to i8*.
-///
-/// TODO: This could probably live somewhere more prominantly if it doesn't
-/// already exist.
-static Value *constructPointer(Type *PtrElemTy, Value *Ptr, int64_t Offset,
- IRBuilder<NoFolder> &IRB, const DataLayout &DL) {
- assert(Offset >= 0 && "Negative offset not supported yet!");
+/// Offset bytes.
+static Value *constructPointer(Value *Ptr, int64_t Offset,
+ IRBuilder<NoFolder> &IRB) {
LLVM_DEBUG(dbgs() << "Construct pointer: " << *Ptr << " + " << Offset
<< "-bytes\n");
- if (Offset) {
- Type *Ty = PtrElemTy;
- APInt IntOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset);
- SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(Ty, IntOffset);
-
- SmallVector<Value *, 4> ValIndices;
- std::string GEPName = Ptr->getName().str();
- for (const APInt &Index : IntIndices) {
- ValIndices.push_back(IRB.getInt(Index));
- GEPName += "." + std::to_string(Index.getZExtValue());
- }
-
- // Create a GEP for the indices collected above.
- Ptr = IRB.CreateGEP(PtrElemTy, Ptr, ValIndices, GEPName);
-
- // If an offset is left we use byte-wise adjustment.
- if (IntOffset != 0) {
- Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(IntOffset),
- GEPName + ".b" + Twine(IntOffset.getZExtValue()));
- }
- }
-
- LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n");
+ if (Offset)
+ Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt64(Offset),
+ Ptr->getName() + ".b" + Twine(Offset));
return Ptr;
}
@@ -7487,16 +7460,15 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
- Value *Ptr = constructPointer(
- PrivType, &Base, PrivStructLayout->getElementOffset(u), IRB, DL);
+ Value *Ptr =
+ constructPointer(&Base, PrivStructLayout->getElementOffset(u), IRB);
new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
}
} else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
Type *PointeeTy = PrivArrayType->getElementType();
uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
- Value *Ptr =
- constructPointer(PrivType, &Base, u * PointeeTySize, IRB, DL);
+ Value *Ptr = constructPointer(&Base, u * PointeeTySize, IRB);
new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
}
} else {
@@ -7521,8 +7493,8 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
Type *PointeeTy = PrivStructType->getElementType(u);
- Value *Ptr = constructPointer(
- PrivType, Base, PrivStructLayout->getElementOffset(u), IRB, DL);
+ Value *Ptr =
+ constructPointer(Base, PrivStructLayout->getElementOffset(u), IRB);
LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
L->setAlignment(Alignment);
ReplacementValues.push_back(L);
@@ -7531,8 +7503,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
Type *PointeeTy = PrivArrayType->getElementType();
uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
- Value *Ptr =
- constructPointer(PrivType, Base, u * PointeeTySize, IRB, DL);
+ Value *Ptr = constructPointer(Base, u * PointeeTySize, IRB);
LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
L->setAlignment(Alignment);
ReplacementValues.push_back(L);
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 7c277518b21d..7ebf265e17ba 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -76,6 +76,7 @@ STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
STATISTIC(NumWriteOnlyArg, "Number of arguments marked writeonly");
STATISTIC(NumNoAlias, "Number of function returns marked noalias");
STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
+STATISTIC(NumNoUndefReturn, "Number of function returns marked noundef");
STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
STATISTIC(NumNoFree, "Number of functions marked as nofree");
@@ -1279,6 +1280,45 @@ static void addNonNullAttrs(const SCCNodeSet &SCCNodes,
}
}
+/// Deduce noundef attributes for the SCC.
+static void addNoUndefAttrs(const SCCNodeSet &SCCNodes,
+ SmallSet<Function *, 8> &Changed) {
+ // Check each function in turn, determining which functions return noundef
+ // values.
+ for (Function *F : SCCNodes) {
+ // Already noundef.
+ if (F->getAttributes().hasRetAttr(Attribute::NoUndef))
+ continue;
+
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ return;
+
+ // MemorySanitizer assumes that the definition and declaration of a
+ // function will be consistent. A function with the sanitize_memory
+ // attribute should therefore be skipped during inference.
+ if (F->hasFnAttribute(Attribute::SanitizeMemory))
+ continue;
+
+ if (F->getReturnType()->isVoidTy())
+ continue;
+
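+ // The return value must be known not to be undef or poison on every
+ // return path for a noundef return attribute to be sound.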
+ if (all_of(*F, [](BasicBlock &BB) {
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ // TODO: perform context-sensitive analysis?
+ return isGuaranteedNotToBeUndefOrPoison(Ret->getReturnValue());
+ }
+ return true;
+ })) {
+ F->addRetAttr(Attribute::NoUndef);
+ ++NumNoUndefReturn;
+ Changed.insert(F);
+ }
+ }
+}
+
namespace {
/// Collects a set of attribute inference requests and performs them all in one
@@ -1629,7 +1669,10 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes,
for (auto &I : BB.instructionsWithoutDebug())
if (auto *CB = dyn_cast<CallBase>(&I)) {
Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee == F || !Callee->doesNotRecurse())
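+ // Declarations marked nocallback cannot call back into this module and
+ // therefore cannot make F recursive.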
+ if (!Callee || Callee == F ||
+ (!Callee->doesNotRecurse() &&
+ !(Callee->isDeclaration() &&
+ Callee->hasFnAttribute(Attribute::NoCallback))))
// Function calls a potentially recursive function.
return;
}
@@ -1785,6 +1828,7 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
inferConvergent(Nodes.SCCNodes, Changed);
addNoReturnAttrs(Nodes.SCCNodes, Changed);
addWillReturn(Nodes.SCCNodes, Changed);
+ addNoUndefAttrs(Nodes.SCCNodes, Changed);
// If we have no external nodes participating in the SCC, we can deduce some
// more precise attributes as well.
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 2c880316e0a1..4176d561363f 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2053,6 +2053,9 @@ private:
LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
<< " functions, result: " << Changed << ".\n");
+ if (Changed == ChangeStatus::CHANGED)
+ OMPInfoCache.invalidateAnalyses();
+
return Changed == ChangeStatus::CHANGED;
}
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 6c6f0a0eca72..2fd8668d15e2 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -794,10 +794,9 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
return R;
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
- auto T = FS->findCallTargetMapAt(CallSite);
Sum = 0;
- if (T)
- for (const auto &T_C : T.get())
+ if (auto T = FS->findCallTargetMapAt(CallSite))
+ for (const auto &T_C : *T)
Sum += T_C.second;
if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
if (M->empty())
@@ -1679,7 +1678,8 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
if (!FS)
continue;
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
- auto T = FS->findCallTargetMapAt(CallSite);
+ ErrorOr<SampleRecord::CallTargetMap> T =
+ FS->findCallTargetMapAt(CallSite);
if (!T || T.get().empty())
continue;
if (FunctionSamples::ProfileIsProbeBased) {
@@ -2261,9 +2261,8 @@ void SampleProfileMatcher::countProfileCallsiteMismatches(
// Compute number of samples in the original profile.
uint64_t CallsiteSamples = 0;
- auto CTM = FS.findCallTargetMapAt(Loc);
- if (CTM) {
- for (const auto &I : CTM.get())
+ if (auto CTM = FS.findCallTargetMapAt(Loc)) {
+ for (const auto &I : *CTM)
CallsiteSamples += I.second;
}
const auto *FSMap = FS.findFunctionSamplesMapAt(Loc);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 719a2678fc18..556fde37efeb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1685,8 +1685,8 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
assert(NotLHS != nullptr && NotRHS != nullptr &&
"isFreeToInvert desynced with getFreelyInverted");
Value *LHSPlusRHS = Builder.CreateAdd(NotLHS, NotRHS);
- return BinaryOperator::CreateSub(ConstantInt::get(RHS->getType(), -2),
- LHSPlusRHS);
+ return BinaryOperator::CreateSub(
+ ConstantInt::getSigned(RHS->getType(), -2), LHSPlusRHS);
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 5e362f4117d0..c03f50d75814 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3513,9 +3513,13 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
return BinaryOperator::CreateOr(Op0, C);
// ((B | C) & A) | B -> B | (A & C)
- if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
+ if (match(Op0, m_c_And(m_c_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
+ // B | ((B | C) & A) -> B | (A & C)
+ if (match(Op1, m_c_And(m_c_Or(m_Specific(Op0), m_Value(C)), m_Value(A))))
+ return BinaryOperator::CreateOr(Op0, Builder.CreateAnd(A, C));
+
if (Instruction *DeMorgan = matchDeMorgansLaws(I, *this))
return DeMorgan;
@@ -3872,6 +3876,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
}
}
+ // (X & C1) | C2 -> X & (C1 | C2) iff (X & C2) == C2
+ if (match(Op0, m_OneUse(m_And(m_Value(X), m_APInt(C1)))) &&
+ match(Op1, m_APInt(C2))) {
+ KnownBits KnownX = computeKnownBits(X, /*Depth*/ 0, &I);
+ if ((KnownX.One & *C2) == *C2)
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *C1 | *C2));
+ }
+
return nullptr;
}
@@ -3956,35 +3968,50 @@ Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
const APInt *LC, *RC;
if (match(LHS1, m_APInt(LC)) && match(RHS1, m_APInt(RC)) &&
LHS0->getType() == RHS0->getType() &&
- LHS0->getType()->isIntOrIntVectorTy() &&
- (LHS->hasOneUse() || RHS->hasOneUse())) {
+ LHS0->getType()->isIntOrIntVectorTy()) {
// Convert xor of signbit tests to signbit test of xor'd values:
// (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
// (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
// (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
// (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
bool TrueIfSignedL, TrueIfSignedR;
- if (isSignBitCheck(PredL, *LC, TrueIfSignedL) &&
+ if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
+ isSignBitCheck(PredL, *LC, TrueIfSignedL) &&
isSignBitCheck(PredR, *RC, TrueIfSignedR)) {
Value *XorLR = Builder.CreateXor(LHS0, RHS0);
return TrueIfSignedL == TrueIfSignedR ? Builder.CreateIsNeg(XorLR) :
Builder.CreateIsNotNeg(XorLR);
}
- // (X > C) ^ (X < C + 2) --> X != C + 1
- // (X < C + 2) ^ (X > C) --> X != C + 1
- // Considering the correctness of this pattern, we should avoid that C is
- // non-negative and C + 2 is negative, although it will be matched by other
- // patterns.
- const APInt *C1, *C2;
- if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_APInt(C1)) &&
- PredR == CmpInst::ICMP_SLT && match(RHS1, m_APInt(C2))) ||
- (PredL == CmpInst::ICMP_SLT && match(LHS1, m_APInt(C2)) &&
- PredR == CmpInst::ICMP_SGT && match(RHS1, m_APInt(C1))))
- if (LHS0 == RHS0 && *C1 + 2 == *C2 &&
- (C1->isNegative() || C2->isNonNegative()))
- return Builder.CreateICmpNE(LHS0,
- ConstantInt::get(LHS0->getType(), *C1 + 1));
+ // Fold (icmp pred1 X, C1) ^ (icmp pred2 X, C2)
+ // into a single comparison using range-based reasoning.
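+ // E.g. (X s> C) ^ (X s< C + 2) is folded to X != C + 1.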
+ if (LHS0 == RHS0) {
+ ConstantRange CR1 = ConstantRange::makeExactICmpRegion(PredL, *LC);
+ ConstantRange CR2 = ConstantRange::makeExactICmpRegion(PredR, *RC);
+ auto CRUnion = CR1.exactUnionWith(CR2);
+ auto CRIntersect = CR1.exactIntersectWith(CR2);
+ if (CRUnion && CRIntersect)
+ if (auto CR = CRUnion->exactIntersectWith(CRIntersect->inverse())) {
+ if (CR->isFullSet())
+ return ConstantInt::getTrue(I.getType());
+ if (CR->isEmptySet())
+ return ConstantInt::getFalse(I.getType());
+
+ CmpInst::Predicate NewPred;
+ APInt NewC, Offset;
+ CR->getEquivalentICmp(NewPred, NewC, Offset);
+
+ if ((Offset.isZero() && (LHS->hasOneUse() || RHS->hasOneUse())) ||
+ (LHS->hasOneUse() && RHS->hasOneUse())) {
+ Value *NewV = LHS0;
+ Type *Ty = LHS0->getType();
+ if (!Offset.isZero())
+ NewV = Builder.CreateAdd(NewV, ConstantInt::get(Ty, Offset));
+ return Builder.CreateICmp(NewPred, NewV,
+ ConstantInt::get(Ty, NewC));
+ }
+ }
+ }
}
// Instead of trying to imitate the folds for and/or, decompose this 'xor'
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c496f9c7419b..43d4496571be 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1539,6 +1539,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Instruction *I = foldCommutativeIntrinsicOverSelects(*II))
return I;
+ if (Instruction *I = foldCommutativeIntrinsicOverPhis(*II))
+ return I;
+
if (CallInst *NewCall = canonicalizeConstantArg0ToArg1(CI))
return NewCall;
}
@@ -1793,6 +1796,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Instruction *NewMinMax = factorizeMinMaxTree(II))
return NewMinMax;
+ // Try to fold minmax with constant RHS based on range information
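+ // (e.g. umin(X, C) -> X when X u<= C is known, and -> C when X u>= C).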
+ const APInt *RHSC;
+ if (match(I1, m_APIntAllowUndef(RHSC))) {
+ ICmpInst::Predicate Pred =
+ ICmpInst::getNonStrictPredicate(MinMaxIntrinsic::getPredicate(IID));
+ bool IsSigned = MinMaxIntrinsic::isSigned(IID);
+ ConstantRange LHS_CR = computeConstantRangeIncludingKnownBits(
+ I0, IsSigned, SQ.getWithInstruction(II));
+ if (!LHS_CR.isFullSet()) {
+ if (LHS_CR.icmp(Pred, *RHSC))
+ return replaceInstUsesWith(*II, I0);
+ if (LHS_CR.icmp(ICmpInst::getSwappedPredicate(Pred), *RHSC))
+ return replaceInstUsesWith(*II,
+ ConstantInt::get(II->getType(), *RHSC));
+ }
+ }
+
break;
}
case Intrinsic::bitreverse: {
@@ -3830,6 +3850,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
if (Callee->hasFnAttribute("thunk"))
return false;
+ // If this is a call to a naked function, the assembly might be using an
+ // argument or otherwise relying on the frame layout, so transforming a
+ // call whose prototype does not match is not safe.
+ if (Callee->hasFnAttribute(Attribute::Naked))
+ return false;
+
// If this is a musttail call, the callee's prototype must match the caller's
// prototype with the exception of pointee types. The code below doesn't
// implement that, so we can't do this transform.
@@ -4237,3 +4263,22 @@ InstCombinerImpl::foldCommutativeIntrinsicOverSelects(IntrinsicInst &II) {
return nullptr;
}
+
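+// Fold op(phi(a, b), phi(b, a)) -> op(a, b) when op is a commutative
+// intrinsic and the phis form a symmetric pair.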
+Instruction *
+InstCombinerImpl::foldCommutativeIntrinsicOverPhis(IntrinsicInst &II) {
+ assert(II.isCommutative() && "Instruction should be commutative");
+
+ PHINode *LHS = dyn_cast<PHINode>(II.getOperand(0));
+ PHINode *RHS = dyn_cast<PHINode>(II.getOperand(1));
+
+ if (!LHS || !RHS)
+ return nullptr;
+
+ if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) {
+ replaceOperand(II, 0, P->first);
+ replaceOperand(II, 1, P->second);
+ return &II;
+ }
+
+ return nullptr;
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 289976718e52..3875e59c3ede 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -111,8 +111,8 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI,
ConstantInt *AndCst) {
if (LI->isVolatile() || LI->getType() != GEP->getResultElementType() ||
- GV->getValueType() != GEP->getSourceElementType() ||
- !GV->isConstant() || !GV->hasDefinitiveInitializer())
+ GV->getValueType() != GEP->getSourceElementType() || !GV->isConstant() ||
+ !GV->hasDefinitiveInitializer())
return nullptr;
Constant *Init = GV->getInitializer();
@@ -128,8 +128,7 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// the simple index into a single-dimensional array.
//
// Require: GEP GV, 0, i {{, constant indices}}
- if (GEP->getNumOperands() < 3 ||
- !isa<ConstantInt>(GEP->getOperand(1)) ||
+ if (GEP->getNumOperands() < 3 || !isa<ConstantInt>(GEP->getOperand(1)) ||
!cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
isa<Constant>(GEP->getOperand(2)))
return nullptr;
@@ -142,15 +141,18 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
Type *EltTy = Init->getType()->getArrayElementType();
for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!Idx) return nullptr; // Variable index.
+ if (!Idx)
+ return nullptr; // Variable index.
uint64_t IdxVal = Idx->getZExtValue();
- if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index.
+ if ((unsigned)IdxVal != IdxVal)
+ return nullptr; // Too large array index.
if (StructType *STy = dyn_cast<StructType>(EltTy))
EltTy = STy->getElementType(IdxVal);
else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
- if (IdxVal >= ATy->getNumElements()) return nullptr;
+ if (IdxVal >= ATy->getNumElements())
+ return nullptr;
EltTy = ATy->getElementType();
} else {
return nullptr; // Unknown type.
@@ -191,7 +193,8 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
Constant *Elt = Init->getAggregateElement(i);
- if (!Elt) return nullptr;
+ if (!Elt)
+ return nullptr;
// If this is indexing an array of structures, get the structure element.
if (!LaterIndices.empty()) {
@@ -214,16 +217,17 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
if (isa<UndefValue>(C)) {
// Extend range state machines to cover this element in case there is an
// undef in the middle of the range.
- if (TrueRangeEnd == (int)i-1)
+ if (TrueRangeEnd == (int)i - 1)
TrueRangeEnd = i;
- if (FalseRangeEnd == (int)i-1)
+ if (FalseRangeEnd == (int)i - 1)
FalseRangeEnd = i;
continue;
}
// If we can't compute the result for any of the elements, we have to give
// up evaluating the entire conditional.
- if (!isa<ConstantInt>(C)) return nullptr;
+ if (!isa<ConstantInt>(C))
+ return nullptr;
// Otherwise, we know if the comparison is true or false for this element,
// update our state machines.
@@ -233,7 +237,7 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
if (IsTrueForElt) {
// Update the TrueElement state machine.
if (FirstTrueElement == Undefined)
- FirstTrueElement = TrueRangeEnd = i; // First true element.
+ FirstTrueElement = TrueRangeEnd = i; // First true element.
else {
// Update double-compare state machine.
if (SecondTrueElement == Undefined)
@@ -242,7 +246,7 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
SecondTrueElement = Overdefined;
// Update range state machine.
- if (TrueRangeEnd == (int)i-1)
+ if (TrueRangeEnd == (int)i - 1)
TrueRangeEnd = i;
else
TrueRangeEnd = Overdefined;
@@ -259,7 +263,7 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
SecondFalseElement = Overdefined;
// Update range state machine.
- if (FalseRangeEnd == (int)i-1)
+ if (FalseRangeEnd == (int)i - 1)
FalseRangeEnd = i;
else
FalseRangeEnd = Overdefined;
@@ -348,7 +352,8 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// False for two elements -> 'i != 47 & i != 72'.
Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
- Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
+ Value *SecondFalseIdx =
+ ConstantInt::get(Idx->getType(), SecondFalseElement);
Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
return BinaryOperator::CreateAnd(C1, C2);
}
@@ -365,8 +370,8 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
Idx = Builder.CreateAdd(Idx, Offs);
}
- Value *End = ConstantInt::get(Idx->getType(),
- TrueRangeEnd-FirstTrueElement+1);
+ Value *End =
+ ConstantInt::get(Idx->getType(), TrueRangeEnd - FirstTrueElement + 1);
return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
}
@@ -380,8 +385,8 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
Idx = Builder.CreateAdd(Idx, Offs);
}
- Value *End = ConstantInt::get(Idx->getType(),
- FalseRangeEnd-FirstFalseElement);
+ Value *End =
+ ConstantInt::get(Idx->getType(), FalseRangeEnd - FirstFalseElement);
return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
}
@@ -4624,27 +4629,35 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
}
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
- if (BO0 && isa<OverflowingBinaryOperator>(BO0))
- NoOp0WrapProblem =
- ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
- if (BO1 && isa<OverflowingBinaryOperator>(BO1))
- NoOp1WrapProblem =
- ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
-
+ bool Op0HasNUW = false, Op1HasNUW = false;
+ bool Op0HasNSW = false, Op1HasNSW = false;
// Analyze the case when either Op0 or Op1 is an add instruction.
// Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
+ auto hasNoWrapProblem = [](const BinaryOperator &BO, CmpInst::Predicate Pred,
+ bool &HasNSW, bool &HasNUW) -> bool {
+ if (isa<OverflowingBinaryOperator>(BO)) {
+ HasNUW = BO.hasNoUnsignedWrap();
+ HasNSW = BO.hasNoSignedWrap();
+ return ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && HasNUW) ||
+ (CmpInst::isSigned(Pred) && HasNSW);
+ } else if (BO.getOpcode() == Instruction::Or) {
+ HasNUW = true;
+ HasNSW = true;
+ return true;
+ } else {
+ return false;
+ }
+ };
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Add) {
- A = BO0->getOperand(0);
- B = BO0->getOperand(1);
+
+ if (BO0) {
+ match(BO0, m_AddLike(m_Value(A), m_Value(B)));
+ NoOp0WrapProblem = hasNoWrapProblem(*BO0, Pred, Op0HasNSW, Op0HasNUW);
}
- if (BO1 && BO1->getOpcode() == Instruction::Add) {
- C = BO1->getOperand(0);
- D = BO1->getOperand(1);
+ if (BO1) {
+ match(BO1, m_AddLike(m_Value(C), m_Value(D)));
+ NoOp1WrapProblem = hasNoWrapProblem(*BO1, Pred, Op1HasNSW, Op1HasNUW);
}
// icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow.
@@ -4764,17 +4777,15 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
APInt AP2Abs = AP2->abs();
if (AP1Abs.uge(AP2Abs)) {
APInt Diff = *AP1 - *AP2;
- bool HasNUW = BO0->hasNoUnsignedWrap() && Diff.ule(*AP1);
- bool HasNSW = BO0->hasNoSignedWrap();
Constant *C3 = Constant::getIntegerValue(BO0->getType(), Diff);
- Value *NewAdd = Builder.CreateAdd(A, C3, "", HasNUW, HasNSW);
+ Value *NewAdd = Builder.CreateAdd(
+ A, C3, "", Op0HasNUW && Diff.ule(*AP1), Op0HasNSW);
return new ICmpInst(Pred, NewAdd, C);
} else {
APInt Diff = *AP2 - *AP1;
- bool HasNUW = BO1->hasNoUnsignedWrap() && Diff.ule(*AP2);
- bool HasNSW = BO1->hasNoSignedWrap();
Constant *C3 = Constant::getIntegerValue(BO0->getType(), Diff);
- Value *NewAdd = Builder.CreateAdd(C, C3, "", HasNUW, HasNSW);
+ Value *NewAdd = Builder.CreateAdd(
+ C, C3, "", Op1HasNUW && Diff.ule(*AP2), Op1HasNSW);
return new ICmpInst(Pred, A, NewAdd);
}
}
@@ -4868,16 +4879,14 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
isKnownNonZero(Z, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
// if Z != 0 and nsw(X * Z) and nsw(Y * Z)
// X * Z eq/ne Y * Z -> X eq/ne Y
- if (NonZero && BO0 && BO1 && BO0->hasNoSignedWrap() &&
- BO1->hasNoSignedWrap())
+ if (NonZero && BO0 && BO1 && Op0HasNSW && Op1HasNSW)
return new ICmpInst(Pred, X, Y);
} else
NonZero = isKnownNonZero(Z, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
// If Z != 0 and nuw(X * Z) and nuw(Y * Z)
// X * Z u{lt/le/gt/ge}/eq/ne Y * Z -> X u{lt/le/gt/ge}/eq/ne Y
- if (NonZero && BO0 && BO1 && BO0->hasNoUnsignedWrap() &&
- BO1->hasNoUnsignedWrap())
+ if (NonZero && BO0 && BO1 && Op0HasNUW && Op1HasNUW)
return new ICmpInst(Pred, X, Y);
}
}
@@ -4966,7 +4975,8 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
case Instruction::SDiv:
- if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ if (!(I.isEquality() || match(BO0->getOperand(1), m_NonNegative())) ||
+ !BO0->isExact() || !BO1->isExact())
break;
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
@@ -4976,8 +4986,8 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
case Instruction::Shl: {
- bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
- bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
+ bool NUW = Op0HasNUW && Op1HasNUW;
+ bool NSW = Op0HasNSW && Op1HasNSW;
if (!NUW && !NSW)
break;
if (!NSW && I.isSigned())
@@ -5029,10 +5039,10 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
}
/// Fold icmp Pred min|max(X, Y), Z.
-Instruction *
-InstCombinerImpl::foldICmpWithMinMaxImpl(Instruction &I,
- MinMaxIntrinsic *MinMax, Value *Z,
- ICmpInst::Predicate Pred) {
+Instruction *InstCombinerImpl::foldICmpWithMinMax(Instruction &I,
+ MinMaxIntrinsic *MinMax,
+ Value *Z,
+ ICmpInst::Predicate Pred) {
Value *X = MinMax->getLHS();
Value *Y = MinMax->getRHS();
if (ICmpInst::isSigned(Pred) && !MinMax->isSigned())
@@ -5161,24 +5171,6 @@ InstCombinerImpl::foldICmpWithMinMaxImpl(Instruction &I,
return nullptr;
}
-Instruction *InstCombinerImpl::foldICmpWithMinMax(ICmpInst &Cmp) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *Lhs = Cmp.getOperand(0);
- Value *Rhs = Cmp.getOperand(1);
-
- if (MinMaxIntrinsic *MinMax = dyn_cast<MinMaxIntrinsic>(Lhs)) {
- if (Instruction *Res = foldICmpWithMinMaxImpl(Cmp, MinMax, Rhs, Pred))
- return Res;
- }
-
- if (MinMaxIntrinsic *MinMax = dyn_cast<MinMaxIntrinsic>(Rhs)) {
- if (Instruction *Res = foldICmpWithMinMaxImpl(
- Cmp, MinMax, Lhs, ICmpInst::getSwappedPredicate(Pred)))
- return Res;
- }
-
- return nullptr;
-}
// Canonicalize checking for a power-of-2-or-zero value:
static Instruction *foldICmpPow2Test(ICmpInst &I,
@@ -6843,6 +6835,34 @@ static Instruction *foldReductionIdiom(ICmpInst &I,
return nullptr;
}
+// This helper will be called with icmp operands in both orders.
+Instruction *InstCombinerImpl::foldICmpCommutative(ICmpInst::Predicate Pred,
+ Value *Op0, Value *Op1,
+ ICmpInst &CxtI) {
+ // Try to optimize 'icmp GEP, P' or 'icmp P, GEP'.
+ if (auto *GEP = dyn_cast<GEPOperator>(Op0))
+ if (Instruction *NI = foldGEPICmp(GEP, Op1, Pred, CxtI))
+ return NI;
+
+ if (auto *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *NI = foldSelectICmp(Pred, SI, Op1, CxtI))
+ return NI;
+
+ if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(Op0))
+ if (Instruction *Res = foldICmpWithMinMax(CxtI, MinMax, Op1, Pred))
+ return Res;
+
+ {
+ Value *X;
+ const APInt *C;
+ // icmp X+Cst, X
+ if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
+ return foldICmpAddOpConst(X, *C, Pred);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
bool Changed = false;
const SimplifyQuery Q = SQ.getWithInstruction(&I);
@@ -6966,20 +6986,11 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
return Res;
- // Try to optimize 'icmp GEP, P' or 'icmp P, GEP'.
- if (auto *GEP = dyn_cast<GEPOperator>(Op0))
- if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
- return NI;
- if (auto *GEP = dyn_cast<GEPOperator>(Op1))
- if (Instruction *NI = foldGEPICmp(GEP, Op0, I.getSwappedPredicate(), I))
- return NI;
-
- if (auto *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *NI = foldSelectICmp(I.getPredicate(), SI, Op1, I))
- return NI;
- if (auto *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *NI = foldSelectICmp(I.getSwappedPredicate(), SI, Op0, I))
- return NI;
+ if (Instruction *Res = foldICmpCommutative(I.getPredicate(), Op0, Op1, I))
+ return Res;
+ if (Instruction *Res =
+ foldICmpCommutative(I.getSwappedPredicate(), Op1, Op0, I))
+ return Res;
// In case of a comparison with two select instructions having the same
// condition, check whether one of the resulting branches can be simplified.
@@ -7030,9 +7041,6 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
if (Instruction *R = foldICmpWithCastOp(I))
return R;
- if (Instruction *Res = foldICmpWithMinMax(I))
- return Res;
-
{
Value *X, *Y;
// Transform (X & ~Y) == 0 --> (X & Y) != 0
@@ -7134,18 +7142,6 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
!ACXI->isWeak())
return ExtractValueInst::Create(ACXI, 1);
- {
- Value *X;
- const APInt *C;
- // icmp X+Cst, X
- if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
- return foldICmpAddOpConst(X, *C, I.getPredicate());
-
- // icmp X, X+Cst
- if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X)
- return foldICmpAddOpConst(X, *C, I.getSwappedPredicate());
- }
-
if (Instruction *Res = foldICmpWithHighBitMask(I, Builder))
return Res;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index f86db698ef8f..bdaf7550b4b4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -278,6 +278,16 @@ private:
IntrinsicInst &Tramp);
Instruction *foldCommutativeIntrinsicOverSelects(IntrinsicInst &II);
+ // Match a pair of PHI nodes like
+ // phi [a, BB0], [b, BB1] & phi [b, BB0], [a, BB1]
+ // and return the two matched operands.
+ std::optional<std::pair<Value *, Value *>>
+ matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS);
+
+ // Tries to fold (op phi(a, b) phi(b, a)) -> (op a, b)
+ // where op is a commutative intrinsic call.
+ Instruction *foldCommutativeIntrinsicOverPhis(IntrinsicInst &II);
+
Value *simplifyMaskedLoad(IntrinsicInst &II);
Instruction *simplifyMaskedStore(IntrinsicInst &II);
Instruction *simplifyMaskedGather(IntrinsicInst &II);
@@ -492,6 +502,11 @@ public:
/// X % (C0 * C1)
Value *SimplifyAddWithRemainder(BinaryOperator &I);
+ // Tries to fold (Binop phi(a, b) phi(b, a)) -> (Binop a, b)
+ // where Binop is commutative.
+ Value *SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, Value *LHS,
+ Value *RHS);
+
// Binary Op helper for select operations where the expression can be
// efficiently reorganized.
Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
@@ -633,9 +648,8 @@ public:
Instruction *foldICmpInstWithConstantAllowUndef(ICmpInst &Cmp,
const APInt &C);
Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ);
- Instruction *foldICmpWithMinMaxImpl(Instruction &I, MinMaxIntrinsic *MinMax,
- Value *Z, ICmpInst::Predicate Pred);
- Instruction *foldICmpWithMinMax(ICmpInst &Cmp);
+ Instruction *foldICmpWithMinMax(Instruction &I, MinMaxIntrinsic *MinMax,
+ Value *Z, ICmpInst::Predicate Pred);
Instruction *foldICmpEquality(ICmpInst &Cmp);
Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I);
Instruction *foldSignBitTest(ICmpInst &I);
@@ -693,6 +707,8 @@ public:
const APInt &C);
Instruction *foldICmpBitCast(ICmpInst &Cmp);
Instruction *foldICmpWithTrunc(ICmpInst &Cmp);
+ Instruction *foldICmpCommutative(ICmpInst::Predicate Pred, Value *Op0,
+ Value *Op1, ICmpInst &CxtI);
// Helpers of visitSelectInst().
Instruction *foldSelectOfBools(SelectInst &SI);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index e5566578869d..f0ea3d9fcad5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -350,6 +350,13 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y))))
return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y));
+ // (-X * Y) * -X --> (X * Y) * X
+ // (-X << Y) * -X --> (X << Y) * X
+ if (match(Op1, m_Neg(m_Value(X)))) {
+ if (Value *NegOp0 = Negator::Negate(false, /*IsNSW*/ false, Op0, *this))
+ return BinaryOperator::CreateMul(NegOp0, X);
+ }
+
// (X / Y) * Y = X - (X % Y)
// (X / Y) * -Y = (X % Y) - X
{
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 20bf00344b14..ab55f235920a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1171,14 +1171,15 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
return nullptr;
}
-static Instruction *canonicalizeSPF(SelectInst &Sel, ICmpInst &Cmp,
- InstCombinerImpl &IC) {
+static Value *canonicalizeSPF(ICmpInst &Cmp, Value *TrueVal, Value *FalseVal,
+ InstCombinerImpl &IC) {
Value *LHS, *RHS;
// TODO: What to do with pointer min/max patterns?
- if (!Sel.getType()->isIntOrIntVectorTy())
+ if (!TrueVal->getType()->isIntOrIntVectorTy())
return nullptr;
- SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
+ SelectPatternFlavor SPF =
+ matchDecomposedSelectPattern(&Cmp, TrueVal, FalseVal, LHS, RHS).Flavor;
if (SPF == SelectPatternFlavor::SPF_ABS ||
SPF == SelectPatternFlavor::SPF_NABS) {
if (!Cmp.hasOneUse() && !RHS->hasOneUse())
@@ -1188,13 +1189,13 @@ static Instruction *canonicalizeSPF(SelectInst &Sel, ICmpInst &Cmp,
bool IntMinIsPoison = SPF == SelectPatternFlavor::SPF_ABS &&
match(RHS, m_NSWNeg(m_Specific(LHS)));
Constant *IntMinIsPoisonC =
- ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison);
+ ConstantInt::get(Type::getInt1Ty(Cmp.getContext()), IntMinIsPoison);
Instruction *Abs =
IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC);
if (SPF == SelectPatternFlavor::SPF_NABS)
- return BinaryOperator::CreateNeg(Abs); // Always without NSW flag!
- return IC.replaceInstUsesWith(Sel, Abs);
+ return IC.Builder.CreateNeg(Abs); // Always without NSW flag!
+ return Abs;
}
if (SelectPatternResult::isMinOrMax(SPF)) {
@@ -1215,8 +1216,7 @@ static Instruction *canonicalizeSPF(SelectInst &Sel, ICmpInst &Cmp,
default:
llvm_unreachable("Unexpected SPF");
}
- return IC.replaceInstUsesWith(
- Sel, IC.Builder.CreateBinaryIntrinsic(IntrinsicID, LHS, RHS));
+ return IC.Builder.CreateBinaryIntrinsic(IntrinsicID, LHS, RHS);
}
return nullptr;
@@ -1677,8 +1677,9 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI))
return NewSel;
- if (Instruction *NewSPF = canonicalizeSPF(SI, *ICI, *this))
- return NewSPF;
+ if (Value *V =
+ canonicalizeSPF(*ICI, SI.getTrueValue(), SI.getFalseValue(), *this))
+ return replaceInstUsesWith(SI, V);
if (Value *V = foldSelectInstWithICmpConst(SI, ICI, Builder))
return replaceInstUsesWith(SI, V);
@@ -2363,6 +2364,9 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel,
Value *FVal = Sel.getFalseValue();
Type *SelType = Sel.getType();
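+ // The condition must have the compare-result type matching the operands
+ // (i1 for scalars, a vector of i1 with the same element count for vectors).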
+ if (ICmpInst::makeCmpResultType(TVal->getType()) != Cond->getType())
+ return nullptr;
+
// Match select ?, TC, FC where the constants are equal but negated.
// TODO: Generalize to handle a negated variable operand?
const APFloat *TC, *FC;
@@ -3790,5 +3794,50 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (Instruction *I = foldBitCeil(SI, Builder))
return I;
+ // Fold:
+ // (select A && B, T, F) -> (select A, (select B, T, F), F)
+ // (select A || B, T, F) -> (select A, T, (select B, T, F))
+ // if (select B, T, F) is foldable.
+ // TODO: preserve FMF flags
+ auto FoldSelectWithAndOrCond = [&](bool IsAnd, Value *A,
+ Value *B) -> Instruction * {
+ if (Value *V = simplifySelectInst(B, TrueVal, FalseVal,
+ SQ.getWithInstruction(&SI)))
+ return SelectInst::Create(A, IsAnd ? V : TrueVal, IsAnd ? FalseVal : V);
+
+ // Is (select B, T, F) a SPF?
+ if (CondVal->hasOneUse() && SelType->isIntOrIntVectorTy()) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(B))
+ if (Value *V = canonicalizeSPF(*Cmp, TrueVal, FalseVal, *this))
+ return SelectInst::Create(A, IsAnd ? V : TrueVal,
+ IsAnd ? FalseVal : V);
+ }
+
+ return nullptr;
+ };
+
+ Value *LHS, *RHS;
+ if (match(CondVal, m_And(m_Value(LHS), m_Value(RHS)))) {
+ if (Instruction *I = FoldSelectWithAndOrCond(/*IsAnd*/ true, LHS, RHS))
+ return I;
+ if (Instruction *I = FoldSelectWithAndOrCond(/*IsAnd*/ true, RHS, LHS))
+ return I;
+ } else if (match(CondVal, m_Or(m_Value(LHS), m_Value(RHS)))) {
+ if (Instruction *I = FoldSelectWithAndOrCond(/*IsAnd*/ false, LHS, RHS))
+ return I;
+ if (Instruction *I = FoldSelectWithAndOrCond(/*IsAnd*/ false, RHS, LHS))
+ return I;
+ } else {
+ // We cannot swap the operands of logical and/or.
+ // TODO: Can we swap the operands by inserting a freeze?
+ if (match(CondVal, m_LogicalAnd(m_Value(LHS), m_Value(RHS)))) {
+ if (Instruction *I = FoldSelectWithAndOrCond(/*IsAnd*/ true, LHS, RHS))
+ return I;
+ } else if (match(CondVal, m_LogicalOr(m_Value(LHS), m_Value(RHS)))) {
+ if (Instruction *I = FoldSelectWithAndOrCond(/*IsAnd*/ false, LHS, RHS))
+ return I;
+ }
+ }
+
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4188b5b46e87..351fc3b0174f 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1096,6 +1096,54 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) {
return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
}
+std::optional<std::pair<Value *, Value *>>
+InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) {
+ if (LHS->getParent() != RHS->getParent())
+ return std::nullopt;
+
+ if (LHS->getNumIncomingValues() < 2)
+ return std::nullopt;
+
+ if (!equal(LHS->blocks(), RHS->blocks()))
+ return std::nullopt;
+
+ Value *L0 = LHS->getIncomingValue(0);
+ Value *R0 = RHS->getIncomingValue(0);
+
+ for (unsigned I = 1, E = LHS->getNumIncomingValues(); I != E; ++I) {
+ Value *L1 = LHS->getIncomingValue(I);
+ Value *R1 = RHS->getIncomingValue(I);
+
+ if ((L0 == L1 && R0 == R1) || (L0 == R1 && R0 == L1))
+ continue;
+
+ return std::nullopt;
+ }
+
+ return std::optional(std::pair(L0, R0));
+}
+
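+// Fold (Binop phi(a, b), phi(b, a)) -> (Binop a, b) when Binop is
+// commutative and the phis form a symmetric pair.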
+Value *InstCombinerImpl::SimplifyPhiCommutativeBinaryOp(BinaryOperator &I,
+ Value *Op0,
+ Value *Op1) {
+ assert(I.isCommutative() && "Instruction should be commutative");
+
+ PHINode *LHS = dyn_cast<PHINode>(Op0);
+ PHINode *RHS = dyn_cast<PHINode>(Op1);
+
+ if (!LHS || !RHS)
+ return nullptr;
+
+ if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) {
+ Value *BI = Builder.CreateBinOp(I.getOpcode(), P->first, P->second);
+ if (auto *BO = dyn_cast<BinaryOperator>(BI))
+ BO->copyIRFlags(&I);
+ return BI;
+ }
+
+ return nullptr;
+}
+
Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
Value *LHS,
Value *RHS) {
@@ -1529,6 +1577,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) {
BO.getParent() != Phi1->getParent())
return nullptr;
+ if (BO.isCommutative()) {
+ if (Value *V = SimplifyPhiCommutativeBinaryOp(BO, Phi0, Phi1))
+ return replaceInstUsesWith(BO, V);
+ }
+
// Fold if there is at least one specific constant value in phi0 or phi1's
// incoming values that comes from the same block and this specific constant
// value can be used to do optimization for specific binary operator.
@@ -2416,31 +2469,43 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
DL.getIndexSizeInBits(AS)) {
uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedValue();
- bool Matched = false;
- uint64_t C;
- Value *V = nullptr;
if (TyAllocSize == 1) {
- V = GEP.getOperand(1);
- Matched = true;
- } else if (match(GEP.getOperand(1),
- m_AShr(m_Value(V), m_ConstantInt(C)))) {
- if (TyAllocSize == 1ULL << C)
- Matched = true;
- } else if (match(GEP.getOperand(1),
- m_SDiv(m_Value(V), m_ConstantInt(C)))) {
- if (TyAllocSize == C)
- Matched = true;
+ // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X)) to (bitcast Y),
+ // but only if the result pointer is only used as if it were an integer,
+ // or both point to the same underlying object (otherwise provenance is
+ // not necessarily retained).
+ Value *X = GEP.getPointerOperand();
+ Value *Y;
+ if (match(GEP.getOperand(1),
+ m_Sub(m_PtrToInt(m_Value(Y)), m_PtrToInt(m_Specific(X)))) &&
+ GEPType == Y->getType()) {
+ bool HasSameUnderlyingObject =
+ getUnderlyingObject(X) == getUnderlyingObject(Y);
+ bool Changed = false;
+ GEP.replaceUsesWithIf(Y, [&](Use &U) {
+ bool ShouldReplace = HasSameUnderlyingObject ||
+ isa<ICmpInst>(U.getUser()) ||
+ isa<PtrToIntInst>(U.getUser());
+ Changed |= ShouldReplace;
+ return ShouldReplace;
+ });
+ return Changed ? &GEP : nullptr;
+ }
+ } else {
+ // Canonicalize (gep T* X, V / sizeof(T)) to (gep i8* X, V)
+ Value *V;
+ if ((has_single_bit(TyAllocSize) &&
+ match(GEP.getOperand(1),
+ m_Exact(m_AShr(m_Value(V),
+ m_SpecificInt(countr_zero(TyAllocSize)))))) ||
+ match(GEP.getOperand(1),
+ m_Exact(m_SDiv(m_Value(V), m_SpecificInt(TyAllocSize))))) {
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ Builder.getInt8Ty(), GEP.getPointerOperand(), V);
+ NewGEP->setIsInBounds(GEP.isInBounds());
+ return NewGEP;
+ }
}
-
- // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X)) to (bitcast Y), but
- // only if both point to the same underlying object (otherwise provenance
- // is not necessarily retained).
- Value *Y;
- Value *X = GEP.getOperand(0);
- if (Matched &&
- match(V, m_Sub(m_PtrToInt(m_Value(Y)), m_PtrToInt(m_Specific(X)))) &&
- getUnderlyingObject(X) == getUnderlyingObject(Y))
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
}
}
// We do not handle pointer-vector geps here.
@@ -2463,7 +2528,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Idx2);
}
ConstantInt *C;
- if (match(GEP.getOperand(1), m_OneUse(m_SExt(m_OneUse(m_NSWAdd(
+ if (match(GEP.getOperand(1), m_OneUse(m_SExtLike(m_OneUse(m_NSWAdd(
m_Value(Idx1), m_ConstantInt(C))))))) {
// %add = add nsw i32 %idx1, idx2
// %sidx = sext i32 %add to i64
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 6468d07b4f4f..afb0e6cd1548 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -2737,7 +2737,7 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
// the shadow memory.
// We cannot just ignore these methods, because they may call other
// instrumented functions.
- if (F.getName().find(" load]") != std::string::npos) {
+ if (F.getName().contains(" load]")) {
FunctionCallee AsanInitFunction =
declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
IRBuilder<> IRB(&F.front(), F.front().begin());
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 539b7441d24b..2236e9cd44c5 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -535,7 +535,7 @@ bool MemProfiler::maybeInsertMemProfInitAtFunctionEntry(Function &F) {
// the shadow memory.
// We cannot just ignore these methods, because they may call other
// instrumented functions.
- if (F.getName().find(" load]") != std::string::npos) {
+ if (F.getName().contains(" load]")) {
FunctionCallee MemProfInitFunction =
declareSanitizerInitFunction(*F.getParent(), MemProfInitName, {});
IRBuilder<> IRB(&F.front(), F.front().begin());
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index fe672a4377a1..ce570bdfd8b8 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -603,7 +603,7 @@ void ModuleSanitizerCoverage::instrumentFunction(
Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
if (F.empty())
return;
- if (F.getName().find(".module_ctor") != std::string::npos)
+ if (F.getName().contains(".module_ctor"))
return; // Should not instrument sanitizer init functions.
if (F.getName().starts_with("__sanitizer_"))
return; // Don't instrument __sanitizer_* callbacks.
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 98cfadddee8e..49ac1e96e255 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -273,7 +273,16 @@ class ConstraintInfo {
public:
ConstraintInfo(const DataLayout &DL, ArrayRef<Value *> FunctionArgs)
- : UnsignedCS(FunctionArgs), SignedCS(FunctionArgs), DL(DL) {}
+ : UnsignedCS(FunctionArgs), SignedCS(FunctionArgs), DL(DL) {
+ auto &Value2Index = getValue2Index(false);
+ // Add Arg > -1 constraints to the unsigned system for all function arguments.
+ for (Value *Arg : FunctionArgs) {
+ ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0),
+ false, false, false);
+ VarPos.Coefficients[Value2Index[Arg]] = -1;
+ UnsignedCS.addVariableRow(VarPos.Coefficients);
+ }
+ }
DenseMap<Value *, unsigned> &getValue2Index(bool Signed) {
return Signed ? SignedCS.getValue2Index() : UnsignedCS.getValue2Index();
@@ -1001,22 +1010,14 @@ void State::addInfoFor(BasicBlock &BB) {
continue;
}
- if (match(&I, m_Intrinsic<Intrinsic::ssub_with_overflow>())) {
- WorkList.push_back(
- FactOrCheck::getCheck(DT.getNode(&BB), cast<CallInst>(&I)));
- continue;
- }
-
- if (isa<MinMaxIntrinsic>(&I)) {
- WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I));
- continue;
- }
-
- Value *A, *B;
- CmpInst::Predicate Pred;
- // For now, just handle assumes with a single compare as condition.
- if (match(&I, m_Intrinsic<Intrinsic::assume>(
- m_ICmp(Pred, m_Value(A), m_Value(B))))) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
+ switch (ID) {
+ case Intrinsic::assume: {
+ Value *A, *B;
+ CmpInst::Predicate Pred;
+ if (!match(I.getOperand(0), m_ICmp(Pred, m_Value(A), m_Value(B))))
+ break;
if (GuaranteedToExecute) {
// The assume is guaranteed to execute when BB is entered, hence Cond
// holds on entry to BB.
@@ -1026,7 +1027,23 @@ void State::addInfoFor(BasicBlock &BB) {
WorkList.emplace_back(
FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I));
}
+ break;
}
+ // Enqueue ssub_with_overflow for simplification.
+ case Intrinsic::ssub_with_overflow:
+ WorkList.push_back(
+ FactOrCheck::getCheck(DT.getNode(&BB), cast<CallInst>(&I)));
+ break;
+ // Enqueue the intrinsics to add extra info.
+ case Intrinsic::abs:
+ case Intrinsic::umin:
+ case Intrinsic::umax:
+ case Intrinsic::smin:
+ case Intrinsic::smax:
+ WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I));
+ break;
+ }
+
GuaranteedToExecute &= isGuaranteedToTransferExecutionToSuccessor(&I);
}
@@ -1466,6 +1483,17 @@ void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B,
DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
std::move(ValuesToRelease));
+ if (!R.IsSigned) {
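+ // Add V > -1 constraints for the new variables, mirroring what the
+ // constructor does for function arguments.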
+ for (Value *V : NewVariables) {
+ ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0),
+ false, false, false);
+ VarPos.Coefficients[Value2Index[V]] = -1;
+ CSToUse.addVariableRow(VarPos.Coefficients);
+ DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned,
+ SmallVector<Value *, 2>());
+ }
+ }
+
if (R.isEq()) {
// Also add the inverted constraint for equality constraints.
for (auto &Coeff : R.Coefficients)
@@ -1673,6 +1701,13 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
ICmpInst::Predicate Pred;
if (!CB.isConditionFact()) {
+ Value *X;
+ if (match(CB.Inst, m_Intrinsic<Intrinsic::abs>(m_Value(X)))) {
+ // TODO: Add CB.Inst >= 0 fact.
+ AddFact(CmpInst::ICMP_SGE, CB.Inst, X);
+ continue;
+ }
+
if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(CB.Inst)) {
Pred = ICmpInst::getNonStrictPredicate(MinMax->getPredicate());
AddFact(Pred, MinMax, MinMax->getLHS());
@@ -1711,7 +1746,8 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
#ifndef NDEBUG
unsigned SignedEntries =
count_if(DFSInStack, [](const StackEntry &E) { return E.IsSigned; });
- assert(Info.getCS(false).size() == DFSInStack.size() - SignedEntries &&
+ assert(Info.getCS(false).size() - FunctionArgs.size() ==
+ DFSInStack.size() - SignedEntries &&
"updates to CS and DFSInStack are out of sync");
assert(Info.getCS(true).size() == SignedEntries &&
"updates to CS and DFSInStack are out of sync");
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index d2dfc764d042..c44d3748a80d 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -935,11 +935,13 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR,
UDiv->setDebugLoc(SDI->getDebugLoc());
UDiv->setIsExact(SDI->isExact());
- Value *Res = UDiv;
+ auto *Res = UDiv;
// If the operands had two different domains, we need to negate the result.
- if (Ops[0].D != Ops[1].D)
+ if (Ops[0].D != Ops[1].D) {
Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
+ Res->setDebugLoc(SDI->getDebugLoc());
+ }
SDI->replaceAllUsesWith(Res);
SDI->eraseFromParent();
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index edfeb36f3422..c5bf913cda30 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -521,7 +521,7 @@ struct AllSwitchPaths {
const BasicBlock *PrevBB = Path.back();
for (const BasicBlock *BB : Path) {
- if (StateDef.count(BB) != 0) {
+ if (StateDef.contains(BB)) {
const PHINode *Phi = dyn_cast<PHINode>(StateDef[BB]);
assert(Phi && "Expected a state-defining instr to be a phi node.");
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 5e58af0edc15..e36578f3de7a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -592,7 +592,7 @@ uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) {
/// Returns true if a value number exists for the specified value.
bool GVNPass::ValueTable::exists(Value *V) const {
- return valueNumbering.count(V) != 0;
+ return valueNumbering.contains(V);
}
/// lookup_or_add - Returns the value number for the specified value, assigning
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 39607464dd00..a58bbe318563 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -7006,7 +7006,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
LLVM_DEBUG(dbgs() << "Old term-cond:\n"
<< *OldTermCond << "\n"
- << "New term-cond:\b" << *NewTermCond << "\n");
+ << "New term-cond:\n" << *NewTermCond << "\n");
BI->setCondition(NewTermCond);
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f14541a1a037..7cfeb019af97 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -200,6 +200,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.Count = 0;
UP.DefaultUnrollRuntimeCount = 8;
UP.MaxCount = std::numeric_limits<unsigned>::max();
+ UP.MaxUpperBound = UnrollMaxUpperBound;
UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
UP.BEInsns = 2;
UP.Partial = false;
@@ -237,6 +238,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
if (UnrollMaxCount.getNumOccurrences() > 0)
UP.MaxCount = UnrollMaxCount;
+ if (UnrollMaxUpperBound.getNumOccurrences() > 0)
+ UP.MaxUpperBound = UnrollMaxUpperBound;
if (UnrollFullMaxCount.getNumOccurrences() > 0)
UP.FullUnrollMaxCount = UnrollFullMaxCount;
if (UnrollAllowPartial.getNumOccurrences() > 0)
@@ -777,7 +780,7 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
return TripCount;
if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
- MaxTripCount <= UnrollMaxUpperBound)
+ MaxTripCount <= UP.MaxUpperBound)
return MaxTripCount;
// if didn't return until here, should continue to other priorties
@@ -952,7 +955,7 @@ bool llvm::computeUnrollCount(
// cost of exact full unrolling. As such, if we have an exact count and
// found it unprofitable, we'll never chose to bounded unroll.
if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
- MaxTripCount <= UnrollMaxUpperBound) {
+ MaxTripCount <= UP.MaxUpperBound) {
UP.Count = MaxTripCount;
if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
MaxTripCount, UCE, UP)) {
@@ -1026,7 +1029,7 @@ bool llvm::computeUnrollCount(
}
// Don't unroll a small upper bound loop unless user or TTI asked to do so.
- if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+ if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
UP.Count = 0;
return false;
}
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 40b4ea92e1ff..3f02441b74ba 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -2057,7 +2057,7 @@ static void relocationViaAlloca(
for (const auto &Info : Records)
for (auto RematerializedValuePair : Info.RematerializedValues) {
Value *OriginalValue = RematerializedValuePair.second;
- if (AllocaMap.count(OriginalValue) != 0)
+ if (AllocaMap.contains(OriginalValue))
continue;
emitAllocaFor(OriginalValue);
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 24da26c9f0f2..656abdb0abbf 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3285,6 +3285,7 @@ private:
(BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
SliceSize !=
DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
+ !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
!NewAI.getAllocatedType()->isSingleValueType());
// If we're just going to emit a memcpy, the alloca hasn't changed, and the
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index fb4d82885377..282c44563466 100644
--- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -29,9 +29,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -66,19 +67,6 @@ class CanonicalizeFreezeInLoopsImpl {
ScalarEvolution &SE;
DominatorTree &DT;
- struct FrozenIndPHIInfo {
- // A freeze instruction that uses an induction phi
- FreezeInst *FI = nullptr;
- // The induction phi, step instruction, the operand idx of StepInst which is
- // a step value
- PHINode *PHI;
- BinaryOperator *StepInst;
- unsigned StepValIdx = 0;
-
- FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst)
- : PHI(PHI), StepInst(StepInst) {}
- };
-
// Can freeze instruction be pushed into operands of I?
// In order to do this, I should not create a poison after I's flags are
// stripped.
@@ -99,6 +87,46 @@ public:
} // anonymous namespace
+namespace llvm {
+
+struct FrozenIndPHIInfo {
+ // A freeze instruction that uses an induction phi
+ FreezeInst *FI = nullptr;
+ // The induction phi, step instruction, the operand idx of StepInst which is
+ // a step value
+ PHINode *PHI;
+ BinaryOperator *StepInst;
+ unsigned StepValIdx = 0;
+
+ FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst)
+ : PHI(PHI), StepInst(StepInst) {}
+
+ bool operator==(const FrozenIndPHIInfo &Other) { return FI == Other.FI; }
+};
+
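+// DenseMapInfo for FrozenIndPHIInfo, keyed on the freeze instruction, so
+// candidates can be kept in a SmallSetVector.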
+template <> struct DenseMapInfo<FrozenIndPHIInfo> {
+ static inline FrozenIndPHIInfo getEmptyKey() {
+ return FrozenIndPHIInfo(DenseMapInfo<PHINode *>::getEmptyKey(),
+ DenseMapInfo<BinaryOperator *>::getEmptyKey());
+ }
+
+ static inline FrozenIndPHIInfo getTombstoneKey() {
+ return FrozenIndPHIInfo(DenseMapInfo<PHINode *>::getTombstoneKey(),
+ DenseMapInfo<BinaryOperator *>::getTombstoneKey());
+ }
+
+ static unsigned getHashValue(const FrozenIndPHIInfo &Val) {
+ return DenseMapInfo<FreezeInst *>::getHashValue(Val.FI);
+ };
+
+ static bool isEqual(const FrozenIndPHIInfo &LHS,
+ const FrozenIndPHIInfo &RHS) {
+ return LHS.FI == RHS.FI;
+ };
+};
+
+} // end namespace llvm
+
// Given U = (value, user), replace value with freeze(value), and let
// SCEV forget user. The inserted freeze is placed in the preheader.
void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
@@ -126,7 +154,7 @@ bool CanonicalizeFreezeInLoopsImpl::run() {
if (!L->isLoopSimplifyForm())
return false;
- SmallVector<FrozenIndPHIInfo, 4> Candidates;
+ SmallSetVector<FrozenIndPHIInfo, 4> Candidates;
for (auto &PHI : L->getHeader()->phis()) {
InductionDescriptor ID;
@@ -155,7 +183,7 @@ bool CanonicalizeFreezeInLoopsImpl::run() {
if (auto *FI = dyn_cast<FreezeInst>(U)) {
LLVM_DEBUG(dbgs() << "canonfr: found: " << *FI << "\n");
Info.FI = FI;
- Candidates.push_back(Info);
+ Candidates.insert(Info);
}
};
for_each(PHI.users(), Visit);
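Illustrative aside, not from the patch: moving FrozenIndPHIInfo into namespace llvm together with a DenseMapInfo specialization is what allows it to be the element type of a SmallSetVector, whose underlying dense set needs empty/tombstone keys, a hash, and an equality predicate. A reduced sketch of the same pattern with a hypothetical key type:

#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/SetVector.h"

// Hypothetical key type; only the pointer identity matters for hashing.
struct MyKey {
  int *Ptr = nullptr;
  bool operator==(const MyKey &Other) const { return Ptr == Other.Ptr; }
};

namespace llvm {
template <> struct DenseMapInfo<MyKey> {
  static MyKey getEmptyKey() { return {DenseMapInfo<int *>::getEmptyKey()}; }
  static MyKey getTombstoneKey() {
    return {DenseMapInfo<int *>::getTombstoneKey()};
  }
  static unsigned getHashValue(const MyKey &K) {
    return DenseMapInfo<int *>::getHashValue(K.Ptr);
  }
  static bool isEqual(const MyKey &L, const MyKey &R) { return L == R; }
};
} // namespace llvm

// With the specialization in place, duplicate keys are dropped on insert
// while first-insertion order is preserved:
//   llvm::SmallSetVector<MyKey, 4> Keys;
//   Keys.insert(MyKey{&SomeInt}); // true on first insert, false on repeats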
diff --git a/llvm/lib/Transforms/Utils/DXILUpgrade.cpp b/llvm/lib/Transforms/Utils/DXILUpgrade.cpp
index 735686ddce38..09991f628224 100644
--- a/llvm/lib/Transforms/Utils/DXILUpgrade.cpp
+++ b/llvm/lib/Transforms/Utils/DXILUpgrade.cpp
@@ -7,14 +7,26 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/DXILUpgrade.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
+#define DEBUG_TYPE "dxil-upgrade"
+
static bool handleValVerMetadata(Module &M) {
NamedMDNode *ValVer = M.getNamedMetadata("dx.valver");
if (!ValVer)
return false;
+ LLVM_DEBUG({
+ MDNode *N = ValVer->getOperand(0);
+ auto X = mdconst::extract<ConstantInt>(N->getOperand(0))->getZExtValue();
+ auto Y = mdconst::extract<ConstantInt>(N->getOperand(1))->getZExtValue();
+ dbgs() << "DXIL: validation version: " << X << "." << Y << "\n";
+ });
// We don't need the validation version internally, so we drop it.
ValVer->dropAllReferences();
ValVer->eraseFromParent();
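Illustrative aside, not from the patch: the new DEBUG_TYPE plus LLVM_DEBUG block compiles away entirely in builds without assertions, and in assert-enabled builds it only fires under -debug or -debug-only=dxil-upgrade. A small self-contained sketch of the mechanism; the version values 1 and 7 are made up for illustration (a module would carry them as !dx.valver = !{!0} with !0 = !{i32 1, i32 7}):

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "dxil-upgrade"

// Prints only when this DEBUG_TYPE is enabled on the command line; otherwise
// the statement is a no-op (and disappears completely when NDEBUG is set).
static void printExampleValVer() {
  LLVM_DEBUG(llvm::dbgs() << "DXIL: validation version: " << 1 << "." << 7
                          << "\n");
}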
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index a758fb306982..c76cc9db16d7 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3593,8 +3593,9 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C,
if (isa<ConstantInt>(C))
return createIntegerExpression(C);
- if (Ty.isFloatTy() || Ty.isDoubleTy()) {
- const APFloat &APF = cast<ConstantFP>(&C)->getValueAPF();
+ auto *FP = dyn_cast<ConstantFP>(&C);
+ if (FP && (Ty.isFloatTy() || Ty.isDoubleTy())) {
+ const APFloat &APF = FP->getValueAPF();
return DIB.createConstantValueExpression(
APF.bitcastToAPInt().getZExtValue());
}
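Illustrative aside, not from the patch: the change above replaces an unconditional cast<ConstantFP> with a dyn_cast guard, so a float- or double-typed Constant that is not a ConstantFP (for example undef or a constant expression) falls through instead of tripping the cast assertion. A reduced sketch of the same guard; the helper name is ours:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include <cstdint>

// Yields the raw bit pattern only for genuine float/double ConstantFP values;
// anything else is rejected instead of asserting.
static bool extractFPBits(const llvm::Constant &C, uint64_t &Bits) {
  llvm::Type *Ty = C.getType();
  auto *FP = llvm::dyn_cast<llvm::ConstantFP>(&C);
  if (!FP || !(Ty->isFloatTy() || Ty->isDoubleTy()))
    return false;
  Bits = FP->getValueAPF().bitcastToAPInt().getZExtValue();
  return true;
}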
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 1e42d7491676..f94047633022 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -64,7 +64,7 @@ bool forAllReachableExits(const DominatorTree &DT, const PostDominatorTree &PDT,
// sure that the return is covered. Otherwise, we can check whether there
// is a way to reach the RI from the start of the lifetime without passing
// through an end.
- if (EndBlocks.count(RI->getParent()) > 0 ||
+ if (EndBlocks.contains(RI->getParent()) ||
!isPotentiallyReachable(Start, RI, &EndBlocks, &DT, &LI)) {
++NumCoveredExits;
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 722ed03db3de..42e7c4006b42 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -27,6 +27,7 @@
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "indvars"
@@ -786,8 +787,6 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
/// otherwise.
bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
Instruction *IVOperand) {
- using namespace llvm::PatternMatch;
-
if (BO->getOpcode() == Instruction::Shl) {
bool Changed = false;
ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
@@ -1763,7 +1762,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, SCEVExpander &Rewri
};
// Our raison d'etre! Eliminate sign and zero extension.
- if ((isa<SExtInst>(DU.NarrowUse) && canWidenBySExt()) ||
+ if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && canWidenBySExt()) ||
(isa<ZExtInst>(DU.NarrowUse) && canWidenByZExt())) {
Value *NewDef = DU.WideDef;
if (DU.NarrowUse->getType() != WideType) {
@@ -2011,8 +2010,6 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
/// by looking at dominating conditions inside of the loop
void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
Instruction *NarrowUser) {
- using namespace llvm::PatternMatch;
-
Value *NarrowDefLHS;
const APInt *NarrowDefRHS;
if (!match(NarrowDef, m_NSWAdd(m_Value(NarrowDefLHS),
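Illustrative aside, and an assumption on our part rather than something stated in the patch: m_SExtLike appears to match either a plain sext or a zext carrying the nneg flag, both of which preserve the signed value, which would let the sign-extension elimination above also fire on zext nneg users. A minimal sketch of how such a matcher is used:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Hypothetical helper: true for `sext i32 %x to i64` and, if our reading of
// the matcher is right, for `zext nneg i32 %x to i64` as well.
static bool looksLikeSExt(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  return match(V, m_SExtLike(m_Value()));
}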
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f82e161fb846..8e135d80f4f2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8174,13 +8174,20 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+ if (Consecutive) {
+ auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+ Reverse, I->getDebugLoc());
+ Builder.getInsertBlock()->appendRecipe(VectorPtr);
+ Ptr = VectorPtr;
+ }
if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
- Consecutive, Reverse);
+ return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
+ Reverse);
StoreInst *Store = cast<StoreInst>(I);
- return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
- Mask, Consecutive, Reverse);
+ return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
+ Consecutive, Reverse);
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -9475,8 +9482,8 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
bool isMaskRequired = getMask();
if (isMaskRequired) {
- // Mask reversal is only neede for non-all-one (null) masks, as reverse of a
- // null all-one mask is a null mask.
+ // Mask reversal is only needed for non-all-one (null) masks, as reverse of
+ // a null all-one mask is a null mask.
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *Mask = State.get(getMask(), Part);
if (isReverse())
@@ -9485,44 +9492,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
}
}
- const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
- // Calculate the pointer for the specific unroll-part.
- Value *PartPtr = nullptr;
-
- // Use i32 for the gep index type when the value is constant,
- // or query DataLayout for a more suitable index type otherwise.
- const DataLayout &DL =
- Builder.GetInsertBlock()->getModule()->getDataLayout();
- Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
- ? DL.getIndexType(PointerType::getUnqual(
- ScalarDataTy->getContext()))
- : Builder.getInt32Ty();
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = gep->isInBounds();
- if (isReverse()) {
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
- // RunTimeVF = VScale * VF.getKnownMinValue()
- // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
- Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
- // NumElt = -Part * RunTimeVF
- Value *NumElt =
- Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
- // LastLane = 1 - RunTimeVF
- Value *LastLane =
- Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
- PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
- PartPtr =
- Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
- } else {
- Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
- PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
- }
-
- return PartPtr;
- };
-
// Handle Stores:
if (SI) {
State.setDebugLocFrom(SI->getDebugLoc());
@@ -9543,8 +9512,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
- auto *VecPtr =
- CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ auto *VecPtr = State.get(getAddr(), Part);
if (isMaskRequired)
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
BlockInMaskParts[Part]);
@@ -9568,8 +9536,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
nullptr, "wide.masked.gather");
State.addMetadata(NewLI, LI);
} else {
- auto *VecPtr =
- CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ auto *VecPtr = State.get(getAddr(), Part);
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5c325ad8a291..304991526064 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4925,36 +4925,34 @@ void BoUpSLP::buildExternalUses(
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
- if (!UserInst)
+ if (!UserInst || isDeleted(UserInst))
continue;
- if (isDeleted(UserInst))
+ // Ignore users in the user ignore list.
+ if (UserIgnoreList && UserIgnoreList->contains(UserInst))
continue;
// Skip in-tree scalars that become vectors
if (TreeEntry *UseEntry = getTreeEntry(U)) {
- Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
- // instructions. If that is the case, the one in Lane 0 will
+ // instructions. If that is the case, the one in FoundLane will
// be used.
- if (UseScalar != U ||
- UseEntry->State == TreeEntry::ScatterVectorize ||
+ if (UseEntry->State == TreeEntry::ScatterVectorize ||
UseEntry->State == TreeEntry::PossibleStridedVectorize ||
- !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ !doesInTreeUserNeedToExtract(
+ Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
continue;
}
+ U = nullptr;
}
- // Ignore users in the user ignore list.
- if (UserIgnoreList && UserIgnoreList->contains(UserInst))
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
- << Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
+ << " from lane " << Lane << " from " << *Scalar
+ << ".\n");
+ ExternalUses.emplace_back(Scalar, U, FoundLane);
}
}
}
@@ -6443,7 +6441,7 @@ bool BoUpSLP::areAllUsersVectorized(
Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
all_of(I->users(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0 ||
+ return ScalarToTreeEntry.contains(U) ||
isVectorLikeInstWithConstOps(U) ||
(isa<ExtractElementInst>(U) && MustGather.contains(U));
});
@@ -8384,6 +8382,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
(void)E;
return TTI->getInstructionCost(VI, CostKind);
};
+ // FIXME: Workaround for syntax error reported by MSVC buildbots.
+ TargetTransformInfo &TTIRef = *TTI;
// Need to clear CommonCost since the final shuffle cost is included into
// vector cost.
auto GetVectorCost = [&](InstructionCost) {
@@ -8398,14 +8398,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// No need to add new vector costs here since we're going to reuse
// same main/alternate vector ops, just do different shuffling.
} else if (Instruction::isBinaryOp(E->getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+ VecCost =
+ TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost +=
- TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+ TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
} else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
- VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
- CI0->getPredicate(), CostKind, VL0);
- VecCost += TTI->getCmpSelInstrCost(
+ VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
+ CI0->getPredicate(), CostKind, VL0);
+ VecCost += TTIRef.getCmpSelInstrCost(
E->getOpcode(), VecTy, MaskTy,
cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
E->getAltOp());
@@ -8414,10 +8415,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
- TTI::CastContextHint::None, CostKind);
- VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
- TTI::CastContextHint::None, CostKind);
+ VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ TTI::CastContextHint::None, CostKind);
+ VecCost +=
+ TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ TTI::CastContextHint::None, CostKind);
}
SmallVector<int> Mask;
E->buildAltOpShuffleMask(
@@ -8426,8 +8428,27 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return I->getOpcode() == E->getAltOpcode();
},
Mask);
- VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
- FinalVecTy, Mask);
+ VecCost += TTIRef.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ FinalVecTy, Mask);
+ // Patterns like [fadd,fsub] can be combined into a single instruction
+ // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
+ // need to take into account their order when looking for the most used
+ // order.
+ unsigned Opcode0 = E->getOpcode();
+ unsigned Opcode1 = E->getAltOpcode();
+ // The opcode mask selects between the two opcodes.
+ SmallBitVector OpcodeMask(E->Scalars.size(), false);
+ for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
+ if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
+ OpcodeMask.set(Lane);
+ // If this pattern is supported by the target then we consider the
+ // order.
+ if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
+ InstructionCost AltVecCost = TTIRef.getAltInstrCost(
+ VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
+ return AltVecCost < VecCost ? AltVecCost : VecCost;
+ }
+ // TODO: Check the reverse order too.
return VecCost;
};
return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -11493,17 +11514,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
-
- // The pointer operand uses an in-tree scalar so we add the new
- // LoadInst to ExternalUses list to make sure that an extract will
- // be generated in the future.
- if (isa<Instruction>(PO)) {
- if (TreeEntry *Entry = getTreeEntry(PO)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(PO);
- ExternalUses.emplace_back(PO, NewLI, FoundLane);
- }
- }
} else {
assert((E->State == TreeEntry::ScatterVectorize ||
E->State == TreeEntry::PossibleStridedVectorize) &&
@@ -11539,17 +11549,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
StoreInst *ST =
Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
- // The pointer operand uses an in-tree scalar, so add the new StoreInst to
- // ExternalUses to make sure that an extract will be generated in the
- // future.
- if (isa<Instruction>(Ptr)) {
- if (TreeEntry *Entry = getTreeEntry(Ptr)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(Ptr);
- ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane));
- }
- }
-
Value *V = propagateMetadata(ST, E->Scalars);
E->VectorizedValue = V;
@@ -11597,10 +11596,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E);
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
- if (Function *FI = CI->getCalledFunction())
- IID = FI->getIntrinsicID();
-
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
@@ -11611,18 +11606,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
SmallVector<Value *> OpVecs;
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
- if (isVectorIntrinsicWithOverloadTypeAtArg(IID, -1))
+ if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
TysForDecl.push_back(
FixedVectorType::get(CI->getType(), E->Scalars.size()));
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(I);
OpVecs.push_back(CEI->getArgOperand(I));
- if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
TysForDecl.push_back(ScalarArg->getType());
continue;
}
@@ -11634,7 +11629,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
}
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
- if (isVectorIntrinsicWithOverloadTypeAtArg(IID, I))
+ if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
TysForDecl.push_back(OpVec->getType());
}
@@ -11654,18 +11649,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
- // The scalar argument uses an in-tree scalar so we add the new vectorized
- // call to ExternalUses list to make sure that an extract will be
- // generated in the future.
- if (isa_and_present<Instruction>(ScalarArg)) {
- if (TreeEntry *Entry = getTreeEntry(ScalarArg)) {
- // Find which lane we need to extract.
- unsigned FoundLane = Entry->findLaneForValue(ScalarArg);
- ExternalUses.push_back(
- ExternalUser(ScalarArg, cast<User>(V), FoundLane));
- }
- }
-
propagateIRFlags(V, E->Scalars, VL0);
V = FinalShuffle(V, E, VecTy, IsSigned);
@@ -11877,6 +11860,7 @@ Value *BoUpSLP::vectorizeTree(
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
SmallDenseSet<Value *, 4> UsedInserts;
DenseMap<Value *, Value *> VectorCasts;
+ SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -11947,13 +11931,27 @@ Value *BoUpSLP::vectorizeTree(
VectorToInsertElement.try_emplace(Vec, IE);
return Vec;
};
- // If User == nullptr, the Scalar is used as extra arg. Generate
- // ExtractElement instruction and update the record for this scalar in
- // ExternallyUsedValues.
+ // If User == nullptr, the Scalar remains as scalar in vectorized
+ // instructions or is used as extra arg. Generate ExtractElement instruction
+ // and update the record for this scalar in ExternallyUsedValues.
if (!User) {
- assert(ExternallyUsedValues.count(Scalar) &&
- "Scalar with nullptr as an external user must be registered in "
- "ExternallyUsedValues map");
+ if (!ScalarsWithNullptrUser.insert(Scalar).second)
+ continue;
+ assert((ExternallyUsedValues.count(Scalar) ||
+ any_of(Scalar->users(),
+ [&](llvm::User *U) {
+ TreeEntry *UseEntry = getTreeEntry(U);
+ return UseEntry &&
+ UseEntry->State == TreeEntry::Vectorize &&
+ E->State == TreeEntry::Vectorize &&
+ doesInTreeUserNeedToExtract(
+ Scalar,
+ cast<Instruction>(UseEntry->Scalars.front()),
+ TLI);
+ })) &&
+ "Scalar with nullptr User must be registered in "
+ "ExternallyUsedValues map or remain as scalar in vectorized "
+ "instructions");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (auto *PHI = dyn_cast<PHINode>(VecI))
Builder.SetInsertPoint(PHI->getParent(),
@@ -16222,7 +16220,7 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
for (auto *V : Candidates) {
auto *GEP = cast<GetElementPtrInst>(V);
auto *GEPIdx = GEP->idx_begin()->get();
- assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
+ assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
Bundle[BundleIndex++] = GEPIdx;
}
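Illustrative aside, not from the patch: the OpcodeMask built above is a per-lane bit vector in which bit I is set when lane I uses the alternate opcode; it is what isLegalAltInstr/getAltInstrCost consume to decide whether, say, an [fadd,fsub] interleaving maps onto a single x86 addsub-style instruction. A reduced sketch of the mask construction with hypothetical inputs:

#include "llvm/ADT/SmallBitVector.h"
#include <vector>

// For lane opcodes [fadd, fsub, fadd, fsub] and AltOpcode == fsub this
// produces a mask with bits 1 and 3 set.
static llvm::SmallBitVector
buildAltOpcodeMask(const std::vector<unsigned> &LaneOpcodes,
                   unsigned AltOpcode) {
  llvm::SmallBitVector Mask(LaneOpcodes.size(), false);
  for (unsigned Lane = 0; Lane != LaneOpcodes.size(); ++Lane)
    if (LaneOpcodes[Lane] == AltOpcode)
      Mask.set(Lane);
  return Mask;
}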
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 94cb76889813..7d33baac52c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1357,6 +1357,36 @@ public:
#endif
};
+/// A recipe to compute the pointers for widened memory accesses of IndexTy for
+/// all parts. If IsReverse is true, compute pointers for accessing the input in
+/// reverse order per part.
+class VPVectorPointerRecipe : public VPRecipeBase, public VPValue {
+ Type *IndexedTy;
+ bool IsReverse;
+
+public:
+ VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
+ DebugLoc DL)
+ : VPRecipeBase(VPDef::VPVectorPointerSC, {Ptr}, DL), VPValue(this),
+ IndexedTy(IndexedTy), IsReverse(IsReverse) {}
+
+ VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
+
+ void execute(VPTransformState &State) override;
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 02e400d590be..76961629aece 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -498,16 +498,17 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
"DbgInfoIntrinsic should have been dropped during VPlan construction");
State.setDebugLocFrom(CI.getDebugLoc());
+ bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
FunctionType *VFTy = nullptr;
if (Variant)
VFTy = Variant->getFunctionType();
for (unsigned Part = 0; Part < State.UF; ++Part) {
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
- if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1)) {
+ if (UseIntrinsic &&
+ isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
TysForDecl.push_back(
VectorType::get(CI.getType()->getScalarType(), State.VF));
- }
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
@@ -516,18 +517,19 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
// e.g. linear parameters for pointers.
Value *Arg;
if ((VFTy && !VFTy->getParamType(I.index())->isVectorTy()) ||
- (VectorIntrinsicID != Intrinsic::not_intrinsic &&
+ (UseIntrinsic &&
isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())))
Arg = State.get(I.value(), VPIteration(0, 0));
else
Arg = State.get(I.value(), Part);
- if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
+ if (UseIntrinsic &&
+ isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
}
Function *VectorF;
- if (VectorIntrinsicID != Intrinsic::not_intrinsic) {
+ if (UseIntrinsic) {
// Use vector version of the intrinsic.
Module *M = State.Builder.GetInsertBlock()->getModule();
VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
@@ -1209,6 +1211,59 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPVectorPointerRecipe ::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = nullptr;
+ // Use i32 for the gep index type when the value is constant,
+ // or query DataLayout for a more suitable index type otherwise.
+ const DataLayout &DL =
+ Builder.GetInsertBlock()->getModule()->getDataLayout();
+ Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+ ? DL.getIndexType(IndexedTy->getPointerTo())
+ : Builder.getInt32Ty();
+ Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
+ bool InBounds = false;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = GEP->isInBounds();
+ if (IsReverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ // RunTimeVF = VScale * VF.getKnownMinValue()
+ // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
+ Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
+ // NumElt = -Part * RunTimeVF
+ Value *NumElt = Builder.CreateMul(
+ ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
+ // LastLane = 1 - RunTimeVF
+ Value *LastLane =
+ Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+ PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
+ PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
+ } else {
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+ PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
+ }
+
+ State.set(this, PartPtr, Part);
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = vector-pointer ";
+ if (IsReverse)
+ O << "(reverse) ";
+
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPBlendRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
// We know that all PHIs in non-header blocks are converted into
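Worked example, not from the patch: in the reverse path of VPVectorPointerRecipe::execute above, with a fixed vector width of 4 (so RunTimeVF = 4), Part 0 gets NumElt = 0 and LastLane = -3, i.e. a pointer 3 elements below the scalar base so the wide access covers elements -3..0 before being reversed; Part 1 gets NumElt = -4 and LastLane = -3, landing 7 elements below the base. A standalone sketch of the same index arithmetic (helper name ours):

#include <cstdint>

// Element offset of the per-part pointer relative to the scalar base pointer
// for a reversed consecutive access, mirroring the two GEPs emitted above.
static int64_t reversePartOffset(int64_t RunTimeVF, int64_t Part) {
  int64_t NumElt = -Part * RunTimeVF; // skip over earlier unroll parts
  int64_t LastLane = 1 - RunTimeVF;   // step back to this part's last lane
  return NumElt + LastLane;
}
// reversePartOffset(4, 0) == -3, reversePartOffset(4, 1) == -7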
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 116acad8e8f3..8cc98f4abf93 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -351,6 +351,7 @@ public:
VPReductionSC,
VPReplicateSC,
VPScalarIVStepsSC,
+ VPVectorPointerSC,
VPWidenCallSC,
VPWidenCanonicalIVSC,
VPWidenCastSC,
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index d3b772e519af..77254b7eb5e6 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -522,6 +522,7 @@ if(runtimes)
FileCheck
count
llvm-cov
+ llvm-lto
llvm-nm
llvm-objdump
llvm-profdata
@@ -529,6 +530,7 @@ if(runtimes)
llvm-xray
not
obj2yaml
+ opt
sancov
sanstats
llvm_gtest_main
diff --git a/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll b/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll
index 21789ca58364..1d7f19d1664e 100644
--- a/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap-codesize.ll
@@ -7,6 +7,8 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SLM
;
; bswap(X)
@@ -69,6 +71,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i64'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I64 = call i64 @llvm.bswap.i64(i64 %a64)
%V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
%V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
@@ -133,6 +142,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i32'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I32 = call i32 @llvm.bswap.i32(i32 %a32)
%V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
%V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
@@ -197,6 +213,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i16'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I16 = call i16 @llvm.bswap.i16(i16 %a16)
%V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
%V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
diff --git a/llvm/test/Analysis/CostModel/X86/bswap-latency.ll b/llvm/test/Analysis/CostModel/X86/bswap-latency.ll
index 370369cf7e93..592541f4816f 100644
--- a/llvm/test/Analysis/CostModel/X86/bswap-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap-latency.ll
@@ -7,6 +7,8 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SLM
;
; bswap(X)
@@ -22,23 +24,23 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
;
; SSSE3-LABEL: 'cost_bswap_i64'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SSE42-LABEL: 'cost_bswap_i64'
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; AVX1-LABEL: 'cost_bswap_i64'
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; AVX2-LABEL: 'cost_bswap_i64'
@@ -69,6 +71,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i64'
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I64 = call i64 @llvm.bswap.i64(i64 %a64)
%V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
%V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
@@ -86,23 +95,23 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
;
; SSSE3-LABEL: 'cost_bswap_i32'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SSE42-LABEL: 'cost_bswap_i32'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; AVX1-LABEL: 'cost_bswap_i32'
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; AVX2-LABEL: 'cost_bswap_i32'
@@ -133,6 +142,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i32'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I32 = call i32 @llvm.bswap.i32(i32 %a32)
%V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
%V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
@@ -150,16 +166,16 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
;
; SSSE3-LABEL: 'cost_bswap_i16'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SSE42-LABEL: 'cost_bswap_i16'
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; AVX1-LABEL: 'cost_bswap_i16'
@@ -197,6 +213,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i16'
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I16 = call i16 @llvm.bswap.i16(i16 %a16)
%V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
%V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
diff --git a/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll
index 7c53e250c68f..c6cfafb20462 100644
--- a/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap-sizelatency.ll
@@ -7,6 +7,8 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SLM
;
; bswap(X)
@@ -69,6 +71,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i64'
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I64 = call i64 @llvm.bswap.i64(i64 %a64)
%V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
%V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
@@ -133,6 +142,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i32'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I32 = call i32 @llvm.bswap.i32(i32 %a32)
%V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
%V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
@@ -197,6 +213,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i16'
+; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
%I16 = call i16 @llvm.bswap.i16(i16 %a16)
%V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
%V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
diff --git a/llvm/test/Analysis/CostModel/X86/bswap.ll b/llvm/test/Analysis/CostModel/X86/bswap.ll
index 2894c884bd7b..d1c7daa04cfa 100644
--- a/llvm/test/Analysis/CostModel/X86/bswap.ll
+++ b/llvm/test/Analysis/CostModel/X86/bswap.ll
@@ -7,6 +7,8 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM
;
; bswap(X)
@@ -22,16 +24,16 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
;
; SSSE3-LABEL: 'cost_bswap_i64'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'cost_bswap_i64'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'cost_bswap_i64'
@@ -69,6 +71,13 @@ define void @cost_bswap_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i64'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.bswap.i64(i64 %a64)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%I64 = call i64 @llvm.bswap.i64(i64 %a64)
%V2I64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a128)
%V4I64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a256)
@@ -86,16 +95,16 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
;
; SSSE3-LABEL: 'cost_bswap_i32'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'cost_bswap_i32'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'cost_bswap_i32'
@@ -133,6 +142,13 @@ define void @cost_bswap_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i32'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.bswap.i32(i32 %a32)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%I32 = call i32 @llvm.bswap.i32(i32 %a32)
%V2I32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a128)
%V4I32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a256)
@@ -150,16 +166,16 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
;
; SSSE3-LABEL: 'cost_bswap_i16'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'cost_bswap_i16'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'cost_bswap_i16'
@@ -197,6 +213,13 @@ define void @cost_bswap_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
+; SLM-LABEL: 'cost_bswap_i16'
+; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.bswap.i16(i16 %a16)
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %a512)
+; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
%I16 = call i16 @llvm.bswap.i16(i16 %a16)
%V8I16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a128)
%V16I16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a256)
diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
index 5a83d4e81fd3..e0173e9df4dc 100644
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -616,27 +616,31 @@ define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) {
; SSE-LABEL: 'fp_conv'
; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = fpext <4 x float> %c to <4 x double>
; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
-; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float>
-; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %A3 = fpext <16 x float> %b to <16 x double>
+; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A4 = fptrunc <4 x double> undef to <4 x float>
+; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A5 = fptrunc <8 x double> undef to <8 x float>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX-LABEL: 'fp_conv'
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double>
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float>
-; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A3 = fpext <16 x float> %b to <16 x double>
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <4 x double> undef to <4 x float>
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A5 = fptrunc <8 x double> undef to <8 x float>
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512-LABEL: 'fp_conv'
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A2 = fpext <8 x float> %a to <8 x double>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A3 = fpext <16 x float> %b to <16 x double>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <4 x double> undef to <4 x float>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A5 = fptrunc <8 x double> undef to <8 x float>
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%A1 = fpext <4 x float> %c to <4 x double>
%A2 = fpext <8 x float> %a to <8 x double>
- %A3 = fptrunc <4 x double> undef to <4 x float>
- %A4 = fptrunc <8 x double> undef to <8 x float>
+ %A3 = fpext <16 x float> %b to <16 x double>
+ %A4 = fptrunc <4 x double> undef to <4 x float>
+ %A5 = fptrunc <8 x double> undef to <8 x float>
ret void
}
diff --git a/llvm/test/Analysis/ScalarEvolution/max-expr-cache.ll b/llvm/test/Analysis/ScalarEvolution/max-expr-cache.ll
index d401ff31035e..c2d905375069 100644
--- a/llvm/test/Analysis/ScalarEvolution/max-expr-cache.ll
+++ b/llvm/test/Analysis/ScalarEvolution/max-expr-cache.ll
@@ -229,21 +229,21 @@ define void @umax(i32 %tmp3) {
; CHECK-NEXT: %tmp48 = select i1 %tmp47, i32 %tmp44, i32 %tmp46
; CHECK-NEXT: --> ((7 + (256 umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umax (256 umin (1 + (256 umin (1 + (256 umin (1 + (256 umin (1 + (256 umin (1 + (256 umin (1 + (256 umin (1 + (256 umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umin {%tmp3,+,-256}<%bb4>))<nuw><nsw> umin {%tmp3,+,-256}<%bb4>)) U: [7,264) S: [7,264) Exits: <<Unknown>> LoopDispositions: { %bb4: Computable, %bb53: Invariant }
; CHECK-NEXT: %tmp49 = ashr i32 %tmp48, 3
-; CHECK-NEXT: --> %tmp49 U: [0,128) S: [0,128) Exits: <<Unknown>> LoopDispositions: { %bb4: Variant, %bb53: Invariant }
+; CHECK-NEXT: --> %tmp49 U: [-268435456,268435456) S: [-268435456,268435456) Exits: <<Unknown>> LoopDispositions: { %bb4: Variant, %bb53: Invariant }
; CHECK-NEXT: %tmp51 = select i1 %tmp50, i32 %tmp49, i32 0
-; CHECK-NEXT: --> %tmp49 U: [0,128) S: [0,128) Exits: <<Unknown>> LoopDispositions: { %bb4: Variant, %bb53: Invariant }
+; CHECK-NEXT: --> %tmp49 U: [-268435456,268435456) S: [-268435456,268435456) Exits: <<Unknown>> LoopDispositions: { %bb4: Variant, %bb53: Invariant }
; CHECK-NEXT: %tmp52 = zext i32 %tmp51 to i64
-; CHECK-NEXT: --> (zext i32 %tmp49 to i64) U: [0,128) S: [0,128) Exits: <<Unknown>> LoopDispositions: { %bb4: Variant, %bb53: Invariant }
+; CHECK-NEXT: --> (zext i32 %tmp49 to i64) U: [0,4294967296) S: [0,4294967296) Exits: <<Unknown>> LoopDispositions: { %bb4: Variant, %bb53: Invariant }
; CHECK-NEXT: %tmp54 = phi i64 [ undef, %bb4 ], [ %tmp59, %bb53 ]
; CHECK-NEXT: --> {undef,+,1}<nsw><%bb53> U: full-set S: full-set Exits: (-1 + (zext i32 %tmp49 to i64))<nsw> LoopDispositions: { %bb53: Computable, %bb4: Variant }
; CHECK-NEXT: %tmp55 = trunc i64 %tmp54 to i32
; CHECK-NEXT: --> {(trunc i64 undef to i32),+,1}<%bb53> U: full-set S: full-set Exits: (-1 + %tmp49)<nsw> LoopDispositions: { %bb53: Computable, %bb4: Variant }
; CHECK-NEXT: %tmp56 = shl nsw i32 %tmp55, 3
-; CHECK-NEXT: --> {(8 * (trunc i64 undef to i32)),+,8}<%bb53> U: [0,-7) S: [-2147483648,2147483641) Exits: (-8 + (8 * %tmp49)<nuw><nsw>)<nsw> LoopDispositions: { %bb53: Computable, %bb4: Variant }
+; CHECK-NEXT: --> {(8 * (trunc i64 undef to i32)),+,8}<%bb53> U: [0,-7) S: [-2147483648,2147483641) Exits: (-8 + (8 * %tmp49)<nsw>) LoopDispositions: { %bb53: Computable, %bb4: Variant }
; CHECK-NEXT: %tmp57 = sext i32 %tmp56 to i64
-; CHECK-NEXT: --> (sext i32 {(8 * (trunc i64 undef to i32)),+,8}<%bb53> to i64) U: [0,-7) S: [-2147483648,2147483641) Exits: (-8 + (8 * (zext i32 %tmp49 to i64))<nuw><nsw>)<nsw> LoopDispositions: { %bb53: Computable, %bb4: Variant }
+; CHECK-NEXT: --> (sext i32 {(8 * (trunc i64 undef to i32)),+,8}<%bb53> to i64) U: [0,-7) S: [-2147483648,2147483641) Exits: (sext i32 (-8 + (8 * %tmp49)<nsw>) to i64) LoopDispositions: { %bb53: Computable, %bb4: Variant }
; CHECK-NEXT: %tmp58 = getelementptr inbounds i8, ptr null, i64 %tmp57
-; CHECK-NEXT: --> ((sext i32 {(8 * (trunc i64 undef to i32)),+,8}<%bb53> to i64) + null) U: [0,-7) S: [-2147483648,2147483641) Exits: (-8 + (8 * (zext i32 %tmp49 to i64))<nuw><nsw> + null) LoopDispositions: { %bb53: Computable, %bb4: Variant }
+; CHECK-NEXT: --> ((sext i32 {(8 * (trunc i64 undef to i32)),+,8}<%bb53> to i64) + null) U: [0,-7) S: [-2147483648,2147483641) Exits: ((sext i32 (-8 + (8 * %tmp49)<nsw>) to i64) + null) LoopDispositions: { %bb53: Computable, %bb4: Variant }
; CHECK-NEXT: %tmp59 = add nsw i64 %tmp54, 1
; CHECK-NEXT: --> {(1 + undef),+,1}<nsw><%bb53> U: full-set S: full-set Exits: (zext i32 %tmp49 to i64) LoopDispositions: { %bb53: Computable, %bb4: Variant }
; CHECK-NEXT: %tmp62 = add nuw nsw i64 %tmp5, 1
diff --git a/llvm/test/Analysis/ScalarEvolution/pr76234.ll b/llvm/test/Analysis/ScalarEvolution/pr76234.ll
new file mode 100644
index 000000000000..0d82f0ed1a81
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/pr76234.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s
+
+; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=65278
+define i32 @PR76234() {
+; CHECK-LABEL: 'PR76234'
+; CHECK-NEXT: Classifying expressions for: @PR76234
+; CHECK-NEXT: %B9 = shl i896 0, -264147265567832623176169892458258303259423663018060761063980354513336951278362429737208627943828593947337197496628564339441173779751342768625269489231469788454193341999502542084365758838213220526512116454105594202074014146375780869419198449383518238244769290448868999168
+; CHECK-NEXT: --> %B9 U: [0,1) S: [0,1)
+; CHECK-NEXT: %B39 = ashr i896 %B9, 1
+; CHECK-NEXT: --> %B39 U: [0,1) S: [0,1) Exits: <<Unknown>> LoopDispositions: { %1: Variant }
+; CHECK-NEXT: Determining loop execution counts for: @PR76234
+; CHECK-NEXT: Loop %1: <multiple exits> Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %1: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %1: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %1: Unpredictable predicated backedge-taken count.
+;
+ %B9 = shl i896 0, -264147265567832623176169892458258303259423663018060761063980354513336951278362429737208627943828593947337197496628564339441173779751342768625269489231469788454193341999502542084365758838213220526512116454105594202074014146375780869419198449383518238244769290448868999168
+ br label %1
+1:
+ %B39 = ashr i896 %B9, 1
+ br label %1
+}
diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
index 86e7f8c113d1..bea56a72bdea 100644
--- a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
+++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
@@ -43,13 +43,13 @@ define void @test1_no(ptr %p) nounwind {
; This is unusual, since the function is memcpy, but as above, this
; isn't necessarily invalid.
-; CHECK: define void @test2_yes(ptr nocapture %p, ptr nocapture %q, i64 %n) #4 {
+; CHECK: define void @test2_yes(ptr nocapture %p, ptr nocapture %q, i64 %n) #0 {
define void @test2_yes(ptr %p, ptr %q, i64 %n) nounwind {
call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %q, i64 %n, i1 false), !tbaa !1
ret void
}
-; CHECK: define void @test2_no(ptr nocapture writeonly %p, ptr nocapture readonly %q, i64 %n) #5 {
+; CHECK: define void @test2_no(ptr nocapture writeonly %p, ptr nocapture readonly %q, i64 %n) #4 {
define void @test2_no(ptr %p, ptr %q, i64 %n) nounwind {
call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %q, i64 %n, i1 false), !tbaa !2
ret void
@@ -63,7 +63,7 @@ define i32 @test3_yes(ptr %p) nounwind {
ret i32 %t
}
-; CHECK: define i32 @test3_no(ptr nocapture %p) #6 {
+; CHECK: define i32 @test3_no(ptr nocapture %p) #4 {
define i32 @test3_no(ptr %p) nounwind {
%t = va_arg ptr %p, i32, !tbaa !2
ret i32 %t
@@ -76,10 +76,8 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) nounwind
; CHECK: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
; CHECK: attributes #2 = { nofree nosync nounwind memory(none) }
; CHECK: attributes #3 = { nounwind }
-; CHECK: attributes #4 = { mustprogress nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #5 = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) }
-; CHECK: attributes #6 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
-; CHECK: attributes #7 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #4 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #5 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; Root note.
!0 = !{ }
diff --git a/llvm/test/Analysis/ValueTracking/pr75505.ll b/llvm/test/Analysis/ValueTracking/pr75505.ll
new file mode 100644
index 000000000000..79368ff18e06
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/pr75505.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
+
+; Just make sure that we don't assert.
+define i32 @test(<2 x i16> %a, i32 %b) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: <2 x i16> [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i16> [[A]], <i16 -1, i16 poison>
+; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i16> [[MUL]] to i32
+; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[B]], [[BC]]
+; CHECK-NEXT: ret i32 [[LSHR]]
+;
+ %mul = mul <2 x i16> %a, <i16 -1, i16 poison>
+ %bc = bitcast <2 x i16> %mul to i32
+ %lshr = lshr i32 %b, %bc
+ ret i32 %lshr
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
index 4c07081404c8..8529dd388ba0 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
@@ -13,7 +13,7 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) {
; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK-NEXT: liveins: $w1, $w2, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
+ ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0
; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -47,13 +47,13 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) {
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $w1, $x0, $x2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def $x9, pcsections !0 :: (load (s32) from %ir.pnew)
+ ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def renamable $x9, pcsections !0 :: (load (s32) from %ir.pnew)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.cmpxchg.start:
; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0, $x9
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
+ ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0
; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -93,7 +93,7 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) {
; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
; CHECK-NEXT: liveins: $w1, $w2, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
+ ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0
; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -249,7 +249,7 @@ define i32 @fetch_and_nand(ptr %p) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
+ ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
; CHECK-NEXT: renamable $w9 = ANDWri renamable $w8, 2, pcsections !0
; CHECK-NEXT: $w9 = ORNWrs $wzr, killed renamable $w9, 0, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRW killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p)
@@ -302,7 +302,7 @@ define i32 @fetch_and_or(ptr %p) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w9, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
+ ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p)
; CHECK-NEXT: $w10 = ORRWrs renamable $w8, renamable $w9, 0, pcsections !0
; CHECK-NEXT: early-clobber renamable $w11 = STLXRW killed renamable $w10, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p)
; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0
@@ -735,8 +735,8 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
- ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -761,7 +761,7 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
; CHECK-NEXT: early-clobber renamable $w9 = STXRB renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -785,8 +785,8 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
- ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -810,8 +810,8 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
- ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -835,8 +835,8 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
- ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -860,8 +860,8 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
- ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -885,10 +885,10 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0
; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -912,10 +912,10 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0
; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -940,10 +940,10 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w9, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 7
; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def $x10, pcsections !0
+ ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0
; CHECK-NEXT: early-clobber renamable $w11 = STLXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -968,10 +968,10 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w9, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 7
; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
+ ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0
; CHECK-NEXT: early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -995,8 +995,8 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
- ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1021,7 +1021,7 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
; CHECK-NEXT: early-clobber renamable $w9 = STXRH renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1045,8 +1045,8 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
- ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1070,8 +1070,8 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
- ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1095,8 +1095,8 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
- ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1120,8 +1120,8 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
- ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1145,10 +1145,10 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0
; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1172,10 +1172,10 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w1, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0
; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0
+ ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0
; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1200,10 +1200,10 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w9, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 15
; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def $x10, pcsections !0
+ ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0
; CHECK-NEXT: early-clobber renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1228,10 +1228,10 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) {
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $w9, $x0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 15
; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0
- ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0
+ ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0
; CHECK-NEXT: early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr)
; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0
; CHECK-NEXT: {{ $}}
@@ -1257,7 +1257,7 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000)
; CHECK-NEXT: liveins: $w1, $x2, $x8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr)
; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 7, pcsections !0
; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 0, implicit-def $nzcv, pcsections !0
; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0
@@ -1300,7 +1300,7 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000)
; CHECK-NEXT: liveins: $w1, $x2, $x8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+ ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr)
; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 15, pcsections !0
; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 8, implicit-def $nzcv, pcsections !0
; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 81d38a5b0804..be2de620fa45 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -298,3 +298,249 @@ body: |
%ext:_(s32) = G_ANYEXT %sel
$w0 = COPY %ext(s32)
...
+---
+# select cond, 1, 0 --> zext(Cond)
+name: select_cond_1_0_to_zext_cond
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_1_0_to_zext_cond
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %c(s1)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %zero:_(s1) = G_CONSTANT i1 0
+ %one:_(s1) = G_CONSTANT i1 1
+ %sel:_(s1) = G_SELECT %c, %one, %zero
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, 0, 1 --> zext(!Cond)
+name: select_cond_0_1_to_sext_not_cond
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_0_1_to_sext_not_cond
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one
+ ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT [[XOR]](s1)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %zero:_(s1) = G_CONSTANT i1 0
+ %one:_(s1) = G_CONSTANT i1 1
+ %sel:_(s1) = G_SELECT %c, %zero, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, 102, 101 --> add (zext Cond), 101
+name: select_cond_2_1_to_and_zext_cond_false
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_2_1_to_and_zext_cond_false
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
+ ; CHECK-NEXT: %sel:_(s8) = G_ADD [[ZEXT]], %one
+ ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %two:_(s8) = G_CONSTANT i8 102
+ %one:_(s8) = G_CONSTANT i8 101
+ %sel:_(s8) = G_SELECT %c, %two, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, 101, 102 --> add (sext Cond), 102
+name: select_cond_1_2_to_and_sext_cond_false
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_1_2_to_and_sext_cond_false
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 102
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1)
+ ; CHECK-NEXT: %sel:_(s8) = G_ADD [[SEXT]], %one
+ ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %two:_(s8) = G_CONSTANT i8 101
+ %one:_(s8) = G_CONSTANT i8 102
+ %sel:_(s8) = G_SELECT %c, %two, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, 64, 0 --> (zext Cond) << log2(Pow2)
+name: select_cond_64_0_to_shift
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_64_0_to_shift
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+ ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C]](s8)
+ ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %two:_(s8) = G_CONSTANT i8 64
+ %one:_(s8) = G_CONSTANT i8 0
+ %sel:_(s8) = G_SELECT %c, %two, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, -1, 0 --> sext Cond
+name: select_cond_minus_1_0_to_sext_cond
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_minus_1_0_to_sext_cond
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: %ext:_(s32) = G_SEXT %c(s1)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %two:_(s8) = G_CONSTANT i8 255
+ %one:_(s8) = G_CONSTANT i8 0
+ %sel:_(s8) = G_SELECT %c, %two, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, 0, -1 --> sext (!Cond)
+name: select_cond_0_minus_1_to_sext_not_cond
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_0_minus_1_to_sext_not_cond
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+ ; CHECK-NEXT: %ext:_(s32) = G_SEXT [[XOR]](s1)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %two:_(s8) = G_CONSTANT i8 0
+ %one:_(s8) = G_CONSTANT i8 255
+ %sel:_(s8) = G_SELECT %c, %two, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, -1, 101 --> or (sext Cond), 101
+name: select_cond_minus_1_101_to_or_sext_cond_101
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_minus_1_101_to_or_sext_cond_101
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1)
+ ; CHECK-NEXT: %sel:_(s8) = G_OR [[SEXT]], %one
+ ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %two:_(s8) = G_CONSTANT i8 255
+ %one:_(s8) = G_CONSTANT i8 101
+ %sel:_(s8) = G_SELECT %c, %two, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
+---
+# select cond, 101, -1 --> or (sext (not Cond)), 101
+name: select_cond_101_minus_1_to_or_sext_not_cond_101
+body: |
+ bb.1:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: select_cond_101_minus_1_to_or_sext_not_cond_101
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: %two:_(s8) = G_CONSTANT i8 101
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT [[XOR]](s1)
+ ; CHECK-NEXT: %sel:_(s8) = G_OR [[SEXT]], %two
+ ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+ ; CHECK-NEXT: $w0 = COPY %ext(s32)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %c:_(s1) = G_TRUNC %0
+ %t:_(s1) = G_TRUNC %1
+ %f:_(s1) = G_TRUNC %2
+ %two:_(s8) = G_CONSTANT i8 101
+ %one:_(s8) = G_CONSTANT i8 255
+ %sel:_(s8) = G_SELECT %c, %two, %one
+ %ext:_(s32) = G_ANYEXT %sel
+ $w0 = COPY %ext(s32)
+...
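
The new combine-select.mir tests above encode simple integer identities for a select between two constants (zext/sext of the condition, an add, a shift for a power of two, or an or with sext). As a rough standalone C++ sketch — illustrative only, not part of the patch and not the combiner implementation — the scalar equivalents can be checked directly; the constants 101, 102 and 64 simply mirror the values used in the tests:

// Illustrative scalar forms of the select-of-constants folds exercised above.
#include <cassert>
#include <cstdint>
#include <initializer_list>

// select c, 1, 0   -> zext(c)
static uint8_t selOneZero(bool c)  { return static_cast<uint8_t>(c); }
// select c, 0, 1   -> zext(!c)
static uint8_t selZeroOne(bool c)  { return static_cast<uint8_t>(!c); }
// select c, 102, 101 -> add (zext c), 101
static uint8_t selAddForm(bool c)  { return static_cast<uint8_t>(static_cast<uint8_t>(c) + 101); }
// select c, 64, 0  -> (zext c) << log2(64)
static uint8_t selShlForm(bool c)  { return static_cast<uint8_t>(static_cast<uint8_t>(c) << 6); }
// select c, -1, 0  -> sext(c), i.e. all-ones when c is true
static uint8_t selSextForm(bool c) { return static_cast<uint8_t>(-static_cast<int8_t>(c)); }
// select c, -1, 101 -> or (sext c), 101
static uint8_t selOrForm(bool c)   { return static_cast<uint8_t>((-static_cast<int8_t>(c)) | 101); }

int main() {
  for (bool c : {false, true}) {
    assert(selOneZero(c)  == (c ? 1u : 0u));
    assert(selZeroOne(c)  == (c ? 0u : 1u));
    assert(selAddForm(c)  == (c ? 102u : 101u));
    assert(selShlForm(c)  == (c ? 64u : 0u));
    assert(selSextForm(c) == (c ? 0xFFu : 0x00u));
    assert(selOrForm(c)   == (c ? 0xFFu : 101u));
  }
  return 0;
}
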
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 7b09a4cc7b8a..5b78b0d47731 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -23,14 +23,14 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>)
-; GISEL: warning: Instruction selection used fallback path for addv_v2i8
-; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i8
-; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v4i8
-; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i16
-; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i16
-; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i32
-; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i64
-; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i128
+; GISEL: warning: Instruction selection used fallback path for addv_v2i8
+; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i8
+; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v4i8
+; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i16
+; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i16
+; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i32
+; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i64
+; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i128
define i8 @add_B(ptr %arr) {
@@ -101,16 +101,12 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
; GISEL-NEXT: ushll v2.8h, v2.8b, #0
; GISEL-NEXT: usubl v3.4s, v1.4h, v2.4h
; GISEL-NEXT: usubl2 v1.4s, v1.8h, v2.8h
-; GISEL-NEXT: cmgt v2.4s, v0.4s, v3.4s
+; GISEL-NEXT: neg v2.4s, v3.4s
+; GISEL-NEXT: neg v4.4s, v1.4s
+; GISEL-NEXT: cmgt v5.4s, v0.4s, v3.4s
; GISEL-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; GISEL-NEXT: neg v4.4s, v3.4s
-; GISEL-NEXT: neg v5.4s, v1.4s
-; GISEL-NEXT: shl v2.4s, v2.4s, #31
-; GISEL-NEXT: shl v0.4s, v0.4s, #31
-; GISEL-NEXT: sshr v2.4s, v2.4s, #31
-; GISEL-NEXT: sshr v0.4s, v0.4s, #31
-; GISEL-NEXT: bsl v2.16b, v4.16b, v3.16b
-; GISEL-NEXT: bsl v0.16b, v5.16b, v1.16b
+; GISEL-NEXT: bif v2.16b, v3.16b, v5.16b
+; GISEL-NEXT: bsl v0.16b, v4.16b, v1.16b
; GISEL-NEXT: add v0.4s, v2.4s, v0.4s
; GISEL-NEXT: addv s0, v0.4s
; GISEL-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
index 5e477e8947d1..194fe5be40c2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -516,20 +516,17 @@ define i8 @sminv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: sminv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov h3, v0.h[3]
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w12, v0.h[3]
+; CHECK-GI-NEXT: sxtb w11, w8
+; CHECK-GI-NEXT: cmp w11, w9, sxtb
+; CHECK-GI-NEXT: sxtb w11, w10
+; CHECK-GI-NEXT: csel w8, w8, w9, lt
+; CHECK-GI-NEXT: cmp w11, w12, sxtb
; CHECK-GI-NEXT: sxtb w9, w8
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
-; CHECK-GI-NEXT: cmp w9, w10, sxtb
-; CHECK-GI-NEXT: sxtb w9, w11
-; CHECK-GI-NEXT: csel w8, w8, w10, lt
-; CHECK-GI-NEXT: fmov w10, s3
-; CHECK-GI-NEXT: cmp w9, w10, sxtb
-; CHECK-GI-NEXT: sxtb w9, w8
-; CHECK-GI-NEXT: csel w10, w11, w10, lt
+; CHECK-GI-NEXT: csel w10, w10, w12, lt
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: csel w0, w8, w10, lt
; CHECK-GI-NEXT: ret
@@ -611,19 +608,16 @@ define i16 @sminv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: sxth w8, w8
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
+; CHECK-GI-NEXT: smov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[0]
+; CHECK-GI-NEXT: umov w10, v0.h[1]
+; CHECK-GI-NEXT: smov w11, v0.h[2]
+; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w10, sxth
-; CHECK-GI-NEXT: sxth w8, w11
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: csel w9, w9, w12, lt
-; CHECK-GI-NEXT: cmp w8, w9, sxth
-; CHECK-GI-NEXT: csel w0, w9, w10, gt
+; CHECK-GI-NEXT: cmp w8, w12, sxth
+; CHECK-GI-NEXT: csel w8, w9, w10, lt
+; CHECK-GI-NEXT: cmp w11, w8, sxth
+; CHECK-GI-NEXT: csel w0, w8, w13, gt
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a)
@@ -887,20 +881,17 @@ define i8 @smaxv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: smaxv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: sxtb w9, w8
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
-; CHECK-GI-NEXT: cmp w9, w10, sxtb
-; CHECK-GI-NEXT: sxtb w9, w11
-; CHECK-GI-NEXT: csel w8, w8, w10, gt
-; CHECK-GI-NEXT: fmov w10, s3
-; CHECK-GI-NEXT: cmp w9, w10, sxtb
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w12, v0.h[3]
+; CHECK-GI-NEXT: sxtb w11, w8
+; CHECK-GI-NEXT: cmp w11, w9, sxtb
+; CHECK-GI-NEXT: sxtb w11, w10
+; CHECK-GI-NEXT: csel w8, w8, w9, gt
+; CHECK-GI-NEXT: cmp w11, w12, sxtb
; CHECK-GI-NEXT: sxtb w9, w8
-; CHECK-GI-NEXT: csel w10, w11, w10, gt
+; CHECK-GI-NEXT: csel w10, w10, w12, gt
; CHECK-GI-NEXT: cmp w9, w10, sxtb
; CHECK-GI-NEXT: csel w0, w8, w10, gt
; CHECK-GI-NEXT: ret
@@ -982,19 +973,16 @@ define i16 @smaxv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: sxth w8, w8
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
+; CHECK-GI-NEXT: smov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[0]
+; CHECK-GI-NEXT: umov w10, v0.h[1]
+; CHECK-GI-NEXT: smov w11, v0.h[2]
+; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w10, sxth
-; CHECK-GI-NEXT: sxth w8, w11
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: csel w9, w9, w12, gt
-; CHECK-GI-NEXT: cmp w8, w9, sxth
-; CHECK-GI-NEXT: csel w0, w9, w10, lt
+; CHECK-GI-NEXT: cmp w8, w12, sxth
+; CHECK-GI-NEXT: csel w8, w9, w10, gt
+; CHECK-GI-NEXT: cmp w11, w8, sxth
+; CHECK-GI-NEXT: csel w0, w8, w13, lt
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.smax.v3i16(<3 x i16> %a)
@@ -1256,19 +1244,16 @@ define i8 @uminv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: uminv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
-; CHECK-GI-NEXT: fmov w12, s3
-; CHECK-GI-NEXT: and w9, w8, #0xff
-; CHECK-GI-NEXT: cmp w9, w10, uxtb
-; CHECK-GI-NEXT: and w9, w11, #0xff
-; CHECK-GI-NEXT: csel w8, w8, w10, lo
-; CHECK-GI-NEXT: cmp w9, w12, uxtb
-; CHECK-GI-NEXT: csel w9, w11, w12, lo
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w11, v0.h[3]
+; CHECK-GI-NEXT: and w12, w8, #0xff
+; CHECK-GI-NEXT: cmp w12, w9, uxtb
+; CHECK-GI-NEXT: and w12, w10, #0xff
+; CHECK-GI-NEXT: csel w8, w8, w9, lo
+; CHECK-GI-NEXT: cmp w12, w11, uxtb
+; CHECK-GI-NEXT: csel w9, w10, w11, lo
; CHECK-GI-NEXT: and w10, w8, #0xff
; CHECK-GI-NEXT: cmp w10, w9, uxtb
; CHECK-GI-NEXT: csel w0, w8, w9, lo
@@ -1351,19 +1336,16 @@ define i16 @uminv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: uxth w8, w8
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[0]
+; CHECK-GI-NEXT: umov w10, v0.h[1]
+; CHECK-GI-NEXT: umov w11, v0.h[2]
+; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w10, uxth
-; CHECK-GI-NEXT: uxth w8, w11
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: csel w9, w9, w12, lo
-; CHECK-GI-NEXT: cmp w8, w9, uxth
-; CHECK-GI-NEXT: csel w0, w9, w10, hi
+; CHECK-GI-NEXT: cmp w8, w12, uxth
+; CHECK-GI-NEXT: csel w8, w9, w10, lo
+; CHECK-GI-NEXT: cmp w11, w8, uxth
+; CHECK-GI-NEXT: csel w0, w8, w13, hi
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.umin.v3i16(<3 x i16> %a)
@@ -1625,19 +1607,16 @@ define i8 @umaxv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: umaxv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
-; CHECK-GI-NEXT: fmov w12, s3
-; CHECK-GI-NEXT: and w9, w8, #0xff
-; CHECK-GI-NEXT: cmp w9, w10, uxtb
-; CHECK-GI-NEXT: and w9, w11, #0xff
-; CHECK-GI-NEXT: csel w8, w8, w10, hi
-; CHECK-GI-NEXT: cmp w9, w12, uxtb
-; CHECK-GI-NEXT: csel w9, w11, w12, hi
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w11, v0.h[3]
+; CHECK-GI-NEXT: and w12, w8, #0xff
+; CHECK-GI-NEXT: cmp w12, w9, uxtb
+; CHECK-GI-NEXT: and w12, w10, #0xff
+; CHECK-GI-NEXT: csel w8, w8, w9, hi
+; CHECK-GI-NEXT: cmp w12, w11, uxtb
+; CHECK-GI-NEXT: csel w9, w10, w11, hi
; CHECK-GI-NEXT: and w10, w8, #0xff
; CHECK-GI-NEXT: cmp w10, w9, uxtb
; CHECK-GI-NEXT: csel w0, w8, w9, hi
@@ -1719,19 +1698,16 @@ define i16 @umaxv_v3i16(<3 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: uxth w8, w8
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[0]
+; CHECK-GI-NEXT: umov w10, v0.h[1]
+; CHECK-GI-NEXT: umov w11, v0.h[2]
+; CHECK-GI-NEXT: umov w13, v0.h[2]
; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: cmp w8, w10, uxth
-; CHECK-GI-NEXT: uxth w8, w11
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: csel w9, w9, w12, hi
-; CHECK-GI-NEXT: cmp w8, w9, uxth
-; CHECK-GI-NEXT: csel w0, w9, w10, lo
+; CHECK-GI-NEXT: cmp w8, w12, uxth
+; CHECK-GI-NEXT: csel w8, w9, w10, hi
+; CHECK-GI-NEXT: cmp w11, w8, uxth
+; CHECK-GI-NEXT: csel w0, w8, w13, lo
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.umax.v3i16(<3 x i16> %a)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
index 90f09379e68f..7b7ca9d8ffc2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
@@ -73,13 +73,10 @@ define i8 @mulv_v4i8(<4 x i8> %a) {
; CHECK-GI-LABEL: mulv_v4i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: fmov w11, s3
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w11, v0.h[3]
; CHECK-GI-NEXT: mul w8, w8, w9
; CHECK-GI-NEXT: mul w9, w10, w11
; CHECK-GI-NEXT: mul w0, w8, w9
@@ -113,27 +110,20 @@ define i8 @mulv_v8i8(<8 x i8> %a) {
; CHECK-GI-LABEL: mulv_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[5]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov b6, v0.b[6]
-; CHECK-GI-NEXT: mov b7, v0.b[7]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: fmov w11, s3
-; CHECK-GI-NEXT: fmov w12, s5
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: umov w10, v0.b[2]
+; CHECK-GI-NEXT: umov w11, v0.b[3]
+; CHECK-GI-NEXT: umov w12, v0.b[4]
+; CHECK-GI-NEXT: umov w13, v0.b[5]
+; CHECK-GI-NEXT: umov w14, v0.b[6]
+; CHECK-GI-NEXT: umov w15, v0.b[7]
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mul w10, w10, w11
-; CHECK-GI-NEXT: fmov w11, s6
-; CHECK-GI-NEXT: mul w9, w9, w12
-; CHECK-GI-NEXT: fmov w12, s7
-; CHECK-GI-NEXT: mul w8, w8, w10
-; CHECK-GI-NEXT: mul w11, w11, w12
-; CHECK-GI-NEXT: mul w9, w9, w11
+; CHECK-GI-NEXT: mul w9, w10, w11
+; CHECK-GI-NEXT: mul w10, w12, w13
+; CHECK-GI-NEXT: mul w11, w14, w15
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: mul w9, w10, w11
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
@@ -167,27 +157,20 @@ define i8 @mulv_v16i8(<16 x i8> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mul v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[5]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov b6, v0.b[6]
-; CHECK-GI-NEXT: mov b7, v0.b[7]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: fmov w11, s3
-; CHECK-GI-NEXT: fmov w12, s5
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: umov w10, v0.b[2]
+; CHECK-GI-NEXT: umov w11, v0.b[3]
+; CHECK-GI-NEXT: umov w12, v0.b[4]
+; CHECK-GI-NEXT: umov w13, v0.b[5]
+; CHECK-GI-NEXT: umov w14, v0.b[6]
+; CHECK-GI-NEXT: umov w15, v0.b[7]
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: mul w9, w10, w11
+; CHECK-GI-NEXT: mul w10, w12, w13
+; CHECK-GI-NEXT: mul w11, w14, w15
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mul w10, w10, w11
-; CHECK-GI-NEXT: fmov w11, s6
-; CHECK-GI-NEXT: mul w9, w9, w12
-; CHECK-GI-NEXT: fmov w12, s7
-; CHECK-GI-NEXT: mul w8, w8, w10
-; CHECK-GI-NEXT: mul w11, w11, w12
-; CHECK-GI-NEXT: mul w9, w9, w11
+; CHECK-GI-NEXT: mul w9, w10, w11
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
@@ -225,27 +208,20 @@ define i8 @mulv_v32i8(<32 x i8> %a) {
; CHECK-GI-NEXT: mul v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: mul v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mul v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: mov b5, v0.b[5]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov b6, v0.b[6]
-; CHECK-GI-NEXT: mov b7, v0.b[7]
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: fmov w11, s3
-; CHECK-GI-NEXT: fmov w12, s5
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: umov w10, v0.b[2]
+; CHECK-GI-NEXT: umov w11, v0.b[3]
+; CHECK-GI-NEXT: umov w12, v0.b[4]
+; CHECK-GI-NEXT: umov w13, v0.b[5]
+; CHECK-GI-NEXT: umov w14, v0.b[6]
+; CHECK-GI-NEXT: umov w15, v0.b[7]
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mul w10, w10, w11
-; CHECK-GI-NEXT: fmov w11, s6
-; CHECK-GI-NEXT: mul w9, w9, w12
-; CHECK-GI-NEXT: fmov w12, s7
-; CHECK-GI-NEXT: mul w8, w8, w10
-; CHECK-GI-NEXT: mul w11, w11, w12
-; CHECK-GI-NEXT: mul w9, w9, w11
+; CHECK-GI-NEXT: mul w9, w10, w11
+; CHECK-GI-NEXT: mul w10, w12, w13
+; CHECK-GI-NEXT: mul w11, w14, w15
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: mul w9, w10, w11
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
@@ -289,13 +265,11 @@ define i16 @mulv_v3i16(<3 x i16> %a) {
; CHECK-GI-LABEL: mulv_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: mul w0, w8, w9
+; CHECK-GI-NEXT: mul w0, w8, w10
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.mul.v3i16(<3 x i16> %a)
@@ -318,13 +292,10 @@ define i16 @mulv_v4i16(<4 x i16> %a) {
; CHECK-GI-LABEL: mulv_v4i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: fmov w11, s3
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w11, v0.h[3]
; CHECK-GI-NEXT: mul w8, w8, w9
; CHECK-GI-NEXT: mul w9, w10, w11
; CHECK-GI-NEXT: mul w0, w8, w9
@@ -352,13 +323,10 @@ define i16 @mulv_v8i16(<8 x i16> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: fmov w11, s3
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w11, v0.h[3]
; CHECK-GI-NEXT: mul w8, w8, w9
; CHECK-GI-NEXT: mul w9, w10, w11
; CHECK-GI-NEXT: mul w0, w8, w9
@@ -390,15 +358,12 @@ define i16 @mulv_v16i16(<16 x i16> %a) {
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h
; CHECK-GI-NEXT: mul v1.4h, v1.4h, v3.4h
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w11, v0.h[3]
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mul w9, w10, w9
+; CHECK-GI-NEXT: mul w9, w10, w11
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 99aa28d859e1..dbc5417e2313 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -3,6 +3,19 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; CHECK-GI: warning: Instruction selection used fallback path for smull_zext_v4i16_v4i32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl2_v8i16_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl2_v8i16_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl2_v4i32_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl2_v4i32_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl_smlsl2_v8i16_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl_umlsl2_v8i16_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl_smlsl2_v4i32_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl_umlsl2_v4i32_uzp1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for do_stuff
+
define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: // %bb.0:
@@ -226,11 +239,10 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff
; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[0]
+; CHECK-GI-NEXT: mov w9, v0.s[1]
; CHECK-GI-NEXT: ldr d0, [x1]
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov d1, x8
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: mov v1.d[1], x9
diff --git a/llvm/test/CodeGen/AArch64/andcompare.ll b/llvm/test/CodeGen/AArch64/andcompare.ll
index 9a7fa0498299..cbacd17c846d 100644
--- a/llvm/test/CodeGen/AArch64/andcompare.ll
+++ b/llvm/test/CodeGen/AArch64/andcompare.ll
@@ -2451,7 +2451,7 @@ define i32 @cmp_to_ands3(i32 %num, i32 %a) {
;
; GISEL-LABEL: cmp_to_ands3:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov w8, #23
+; GISEL-NEXT: mov w8, #23 // =0x17
; GISEL-NEXT: and w8, w0, w8
; GISEL-NEXT: cmp w8, #7
; GISEL-NEXT: csel w0, w1, wzr, hi
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 3d4749a7b8e7..2181eaaee7db 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -210,24 +210,105 @@ define void @t17(i64 %a) {
ret void
}
-define i32 @LdOffset_i8(ptr %a) {
+; LDRBBroX
+define i8 @LdOffset_i8(ptr %a) {
; CHECK-LABEL: LdOffset_i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ ret i8 %val
+}
+
+; LDRBBroX
+define i32 @LdOffset_i8_zext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_zext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %val to i32
+ ret i32 %conv
+}
+
+; LDRSBWroX
+define i32 @LdOffset_i8_sext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_sext32:
+; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #56952 // =0xde78
; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: ldrsb w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
- %conv = zext i8 %val to i32
+ %conv = sext i8 %val to i32
ret i32 %conv
}
-define i32 @LdOffset_i16(ptr %a) {
+; LDRBBroX
+define i64 @LdOffset_i8_zext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_zext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %val to i64
+ ret i64 %conv
+}
+
+; LDRSBXroX
+define i64 @LdOffset_i8_sext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_sext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #56952 // =0xde78
+; CHECK-NEXT: movk w8, #15, lsl #16
+; CHECK-NEXT: ldrsb x0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %val to i64
+ ret i64 %conv
+}
+
+; LDRHHroX
+define i16 @LdOffset_i16(ptr %a) {
; CHECK-LABEL: LdOffset_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #48368 // =0xbcf0
; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrh w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ ret i16 %val
+}
+
+; LDRHHroX
+define i32 @LdOffset_i16_zext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_zext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrh w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ %conv = zext i16 %val to i32
+ ret i32 %conv
+}
+
+; LDRSHWroX
+define i32 @LdOffset_i16_sext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_sext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
; CHECK-NEXT: ldrsh w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
@@ -236,6 +317,35 @@ define i32 @LdOffset_i16(ptr %a) {
ret i32 %conv
}
+; LDRHHroX
+define i64 @LdOffset_i16_zext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_zext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrh w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ %conv = zext i16 %val to i64
+ ret i64 %conv
+}
+
+; LDRSHXroX
+define i64 @LdOffset_i16_sext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_sext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrsh x0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ %conv = sext i16 %val to i64
+ ret i64 %conv
+}
+
+; LDRWroX
define i32 @LdOffset_i32(ptr %a) {
; CHECK-LABEL: LdOffset_i32:
; CHECK: // %bb.0:
@@ -248,6 +358,133 @@ define i32 @LdOffset_i32(ptr %a) {
ret i32 %val
}
+; LDRWroX
+define i64 @LdOffset_i32_zext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i32_zext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #31200 // =0x79e0
+; CHECK-NEXT: movk w8, #63, lsl #16
+; CHECK-NEXT: ldr w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
+ %val = load i32, ptr %arrayidx, align 2
+ %conv = zext i32 %val to i64
+ ret i64 %conv
+}
+
+; LDRSWroX
+define i64 @LdOffset_i32_sext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i32_sext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #31200 // =0x79e0
+; CHECK-NEXT: movk w8, #63, lsl #16
+; CHECK-NEXT: ldrsw x0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
+ %val = load i32, ptr %arrayidx, align 2
+ %conv = sext i32 %val to i64
+ ret i64 %conv
+}
+
+; LDRXroX
+define i64 @LdOffset_i64(ptr %a) {
+; CHECK-LABEL: LdOffset_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #62400 // =0xf3c0
+; CHECK-NEXT: movk w8, #126, lsl #16
+; CHECK-NEXT: ldr x0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
+ %val = load i64, ptr %arrayidx, align 4
+ ret i64 %val
+}
+
+; LDRDroX
+define <2 x i32> @LdOffset_v2i32(ptr %a) {
+; CHECK-LABEL: LdOffset_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #62400 // =0xf3c0
+; CHECK-NEXT: movk w8, #126, lsl #16
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds <2 x i32>, ptr %a, i64 1039992
+ %val = load <2 x i32>, ptr %arrayidx, align 4
+ ret <2 x i32> %val
+}
+
+; LDRQroX
+define <2 x i64> @LdOffset_v2i64(ptr %a) {
+; CHECK-LABEL: LdOffset_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #59264 // =0xe780
+; CHECK-NEXT: movk w8, #253, lsl #16
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds <2 x i64>, ptr %a, i64 1039992
+ %val = load <2 x i64>, ptr %arrayidx, align 4
+ ret <2 x i64> %val
+}
+
+; LDRSBWroX
+define double @LdOffset_i8_f64(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #56952 // =0xde78
+; CHECK-NEXT: movk w8, #15, lsl #16
+; CHECK-NEXT: ldrsb w8, [x0, x8]
+; CHECK-NEXT: scvtf d0, w8
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %val to double
+ ret double %conv
+}
+
+; LDRSHWroX
+define double @LdOffset_i16_f64(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrsh w8, [x0, x8]
+; CHECK-NEXT: scvtf d0, w8
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %val to double
+ ret double %conv
+}
+
+; LDRSroX
+define double @LdOffset_i32_f64(ptr %a) {
+; CHECK-LABEL: LdOffset_i32_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #31200 // =0x79e0
+; CHECK-NEXT: movk w8, #63, lsl #16
+; CHECK-NEXT: ldr s0, [x0, x8]
+; CHECK-NEXT: ucvtf d0, d0
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
+ %val = load i32, ptr %arrayidx, align 4
+ %conv = uitofp i32 %val to double
+ ret double %conv
+}
+
+; LDRDroX
+define double @LdOffset_i64_f64(ptr %a) {
+; CHECK-LABEL: LdOffset_i64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #62400 // =0xf3c0
+; CHECK-NEXT: movk w8, #126, lsl #16
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: scvtf d0, d0
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
+ %val = load i64, ptr %arrayidx, align 8
+ %conv = sitofp i64 %val to double
+ ret double %conv
+}
+
define i64 @LdOffset_i64_multi_offset(ptr %a) {
; CHECK-LABEL: LdOffset_i64_multi_offset:
; CHECK: // %bb.0:
@@ -295,3 +532,27 @@ define i32 @LdOffset_i16_odd_offset(ptr nocapture noundef readonly %a) {
ret i32 %conv
}
+; The offset can already be encoded with a single mov (MOVNWi)
+define i8 @LdOffset_i8_movnwi(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_movnwi:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16777215 // =0xffffff
+; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 16777215
+ %val = load i8, ptr %arrayidx, align 1
+ ret i8 %val
+}
+
+; Negative test: the offset is too large to be encoded with an add
+define i8 @LdOffset_i8_too_large(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_too_large:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: movk w8, #256, lsl #16
+; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 16777217
+ %val = load i8, ptr %arrayidx, align 1
+ ret i8 %val
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index 821f6e403a27..446526986b88 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -14,7 +14,7 @@ define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
; CHECK-NEXT: bl _foo
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: LBB0_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 5
@@ -42,7 +42,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
; SDISEL-NEXT: bl _foo
; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; SDISEL-NEXT: LBB1_2: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
; SDISEL-NEXT: ret
;
; GISEL-LABEL: single_different:
@@ -55,7 +55,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
; GISEL-NEXT: bl _foo
; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; GISEL-NEXT: LBB1_2: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
; GISEL-NEXT: ret
entry:
%cmp = icmp sle i32 %a, 5
@@ -88,7 +88,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
; SDISEL-NEXT: bl _foo
; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; SDISEL-NEXT: LBB2_3: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
; SDISEL-NEXT: ret
;
; GISEL-LABEL: single_flagclobber:
@@ -106,7 +106,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
; GISEL-NEXT: bl _foo
; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; GISEL-NEXT: LBB2_3: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
; GISEL-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 5
@@ -144,7 +144,7 @@ define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp {
; CHECK-NEXT: bl _foo
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: LBB3_3: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 5
@@ -178,13 +178,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
; SDISEL-NEXT: ccmp w8, #16, #0, ge
; SDISEL-NEXT: b.le LBB4_2
; SDISEL-NEXT: ; %bb.1: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
; SDISEL-NEXT: ret
; SDISEL-NEXT: LBB4_2: ; %if.then
; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; SDISEL-NEXT: bl _foo
; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
; SDISEL-NEXT: ret
;
; GISEL-LABEL: speculate_division:
@@ -194,13 +194,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
; GISEL-NEXT: ccmp w8, #17, #0, gt
; GISEL-NEXT: b.lt LBB4_2
; GISEL-NEXT: ; %bb.1: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
; GISEL-NEXT: ret
; GISEL-NEXT: LBB4_2: ; %if.then
; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; GISEL-NEXT: bl _foo
; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
; GISEL-NEXT: ret
entry:
%cmp = icmp sgt i32 %a, 0
@@ -230,13 +230,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
; SDISEL-NEXT: fccmp s0, s1, #8, ge
; SDISEL-NEXT: b.ge LBB5_2
; SDISEL-NEXT: ; %bb.1: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
; SDISEL-NEXT: ret
; SDISEL-NEXT: LBB5_2: ; %if.then
; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; SDISEL-NEXT: bl _foo
; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
; SDISEL-NEXT: ret
;
; GISEL-LABEL: single_fcmp:
@@ -248,13 +248,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
; GISEL-NEXT: fccmp s0, s1, #8, gt
; GISEL-NEXT: b.ge LBB5_2
; GISEL-NEXT: ; %bb.1: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
; GISEL-NEXT: ret
; GISEL-NEXT: LBB5_2: ; %if.then
; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; GISEL-NEXT: bl _foo
; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
; GISEL-NEXT: ret
entry:
%cmp = icmp sgt i32 %a, 0
@@ -318,7 +318,7 @@ define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp {
; CHECK-NEXT: bl _foo
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: LBB7_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 0
@@ -346,13 +346,13 @@ define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp {
; CHECK-NEXT: cmp w1, #32
; CHECK-NEXT: b.eq LBB8_3
; CHECK-NEXT: ; %bb.2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
; CHECK-NEXT: ret
; CHECK-NEXT: LBB8_3: ; %if.then
; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; CHECK-NEXT: bl _foo
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 5
@@ -380,7 +380,7 @@ define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp {
; CHECK-NEXT: bl _foo
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: LBB9_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 0
@@ -408,7 +408,7 @@ define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp {
; CHECK-NEXT: bl _foo
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: LBB10_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 0
@@ -466,7 +466,7 @@ define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
;
; GISEL-LABEL: select_and:
; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #5
+; GISEL-NEXT: mov w8, #5 ; =0x5
; GISEL-NEXT: cmp w8, w1
; GISEL-NEXT: ccmp w0, w1, #0, ne
; GISEL-NEXT: csel x0, x2, x3, lt
@@ -488,7 +488,7 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
;
; GISEL-LABEL: select_or:
; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #5
+; GISEL-NEXT: mov w8, #5 ; =0x5
; GISEL-NEXT: cmp w8, w1
; GISEL-NEXT: ccmp w0, w1, #8, eq
; GISEL-NEXT: csel x0, x2, x3, lt
@@ -510,7 +510,7 @@ define float @select_or_float(i32 %w0, i32 %w1, float %x2, float %x3) {
;
; GISEL-LABEL: select_or_float:
; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #5
+; GISEL-NEXT: mov w8, #5 ; =0x5
; GISEL-NEXT: cmp w8, w1
; GISEL-NEXT: ccmp w0, w1, #8, eq
; GISEL-NEXT: fcsel s0, s0, s1, lt
@@ -528,17 +528,22 @@ define i64 @gccbug(i64 %x0, i64 %x1) {
; SDISEL-NEXT: cmp x0, #2
; SDISEL-NEXT: ccmp x0, #4, #4, ne
; SDISEL-NEXT: ccmp x1, #0, #0, eq
-; SDISEL-NEXT: mov w8, #1
+; SDISEL-NEXT: mov w8, #1 ; =0x1
; SDISEL-NEXT: cinc x0, x8, eq
; SDISEL-NEXT: ret
;
; GISEL-LABEL: gccbug:
; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #2
+; GISEL-NEXT: cmp x1, #0
+; GISEL-NEXT: cset w8, eq
; GISEL-NEXT: cmp x0, #2
-; GISEL-NEXT: ccmp x0, #4, #4, ne
-; GISEL-NEXT: ccmp x1, #0, #0, eq
-; GISEL-NEXT: csinc x0, x8, xzr, eq
+; GISEL-NEXT: cset w9, eq
+; GISEL-NEXT: cmp x0, #4
+; GISEL-NEXT: cset w10, eq
+; GISEL-NEXT: orr w9, w10, w9
+; GISEL-NEXT: and w8, w9, w8
+; GISEL-NEXT: and x8, x8, #0x1
+; GISEL-NEXT: add x0, x8, #1
; GISEL-NEXT: ret
%cmp0 = icmp eq i64 %x1, 0
%cmp1 = icmp eq i64 %x0, 2
@@ -592,7 +597,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
; SDISEL-LABEL: select_andor32:
; SDISEL: ; %bb.0:
; SDISEL-NEXT: cmp w1, w2
-; SDISEL-NEXT: mov w8, #32
+; SDISEL-NEXT: mov w8, #32 ; =0x20
; SDISEL-NEXT: ccmp w0, w8, #4, lt
; SDISEL-NEXT: ccmp w0, w1, #0, eq
; SDISEL-NEXT: csel w0, w0, w1, eq
@@ -600,7 +605,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
;
; GISEL-LABEL: select_andor32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #32
+; GISEL-NEXT: mov w8, #32 ; =0x20
; GISEL-NEXT: cmp w1, w2
; GISEL-NEXT: ccmp w0, w8, #4, lt
; GISEL-NEXT: ccmp w0, w1, #0, eq
@@ -701,11 +706,11 @@ define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) {
; SDISEL-NEXT: ccmp w0, #13, #0, ge
; SDISEL-NEXT: cset w8, gt
; SDISEL-NEXT: cmp w0, #22
-; SDISEL-NEXT: mov w9, #44
+; SDISEL-NEXT: mov w9, #44 ; =0x2c
; SDISEL-NEXT: ccmp w0, w9, #0, ge
; SDISEL-NEXT: csel w8, wzr, w8, le
; SDISEL-NEXT: cmp w0, #99
-; SDISEL-NEXT: mov w9, #77
+; SDISEL-NEXT: mov w9, #77 ; =0x4d
; SDISEL-NEXT: ccmp w0, w9, #4, ne
; SDISEL-NEXT: cset w9, eq
; SDISEL-NEXT: tst w8, w9
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index 51f8c2ceceec..00cc6b21ccaf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -5,10 +5,8 @@
; Test efficient codegen of vector extends up from legal type to 128 bit
; and 256 bit vector types.
-; CHECK-GI: warning: Instruction selection used fallback path for zext_v32i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v32i1
+; CHECK-GI: warning: Instruction selection used fallback path for zext_v32i1
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for zext_v64i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v64i1
;-----
; Vectors of i16.
@@ -402,69 +400,133 @@ define <32 x i8> @zext_v32i1(<32 x i1> %arg) {
}
define <32 x i8> @sext_v32i1(<32 x i1> %arg) {
-; CHECK-LABEL: sext_v32i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [sp, #64]
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: ldr w9, [sp, #72]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ldr w8, [sp, #80]
-; CHECK-NEXT: mov.b v1[1], w1
-; CHECK-NEXT: mov.b v0[1], w9
-; CHECK-NEXT: ldr w9, [sp]
-; CHECK-NEXT: mov.b v1[2], w2
-; CHECK-NEXT: mov.b v0[2], w8
-; CHECK-NEXT: ldr w8, [sp, #88]
-; CHECK-NEXT: mov.b v1[3], w3
-; CHECK-NEXT: mov.b v0[3], w8
-; CHECK-NEXT: ldr w8, [sp, #96]
-; CHECK-NEXT: mov.b v1[4], w4
-; CHECK-NEXT: mov.b v0[4], w8
-; CHECK-NEXT: ldr w8, [sp, #104]
-; CHECK-NEXT: mov.b v1[5], w5
-; CHECK-NEXT: mov.b v0[5], w8
-; CHECK-NEXT: ldr w8, [sp, #112]
-; CHECK-NEXT: mov.b v1[6], w6
-; CHECK-NEXT: mov.b v0[6], w8
-; CHECK-NEXT: ldr w8, [sp, #120]
-; CHECK-NEXT: mov.b v1[7], w7
-; CHECK-NEXT: mov.b v0[7], w8
-; CHECK-NEXT: ldr w8, [sp, #128]
-; CHECK-NEXT: mov.b v1[8], w9
-; CHECK-NEXT: ldr w9, [sp, #8]
-; CHECK-NEXT: mov.b v0[8], w8
-; CHECK-NEXT: ldr w8, [sp, #136]
-; CHECK-NEXT: mov.b v1[9], w9
-; CHECK-NEXT: ldr w9, [sp, #16]
-; CHECK-NEXT: mov.b v0[9], w8
-; CHECK-NEXT: ldr w8, [sp, #144]
-; CHECK-NEXT: mov.b v1[10], w9
-; CHECK-NEXT: ldr w9, [sp, #24]
-; CHECK-NEXT: mov.b v0[10], w8
-; CHECK-NEXT: ldr w8, [sp, #152]
-; CHECK-NEXT: mov.b v1[11], w9
-; CHECK-NEXT: ldr w9, [sp, #32]
-; CHECK-NEXT: mov.b v0[11], w8
-; CHECK-NEXT: ldr w8, [sp, #160]
-; CHECK-NEXT: mov.b v1[12], w9
-; CHECK-NEXT: ldr w9, [sp, #40]
-; CHECK-NEXT: mov.b v0[12], w8
-; CHECK-NEXT: ldr w8, [sp, #168]
-; CHECK-NEXT: mov.b v1[13], w9
-; CHECK-NEXT: ldr w9, [sp, #48]
-; CHECK-NEXT: mov.b v0[13], w8
-; CHECK-NEXT: ldr w8, [sp, #176]
-; CHECK-NEXT: mov.b v1[14], w9
-; CHECK-NEXT: ldr w9, [sp, #56]
-; CHECK-NEXT: mov.b v0[14], w8
-; CHECK-NEXT: ldr w8, [sp, #184]
-; CHECK-NEXT: mov.b v1[15], w9
-; CHECK-NEXT: mov.b v0[15], w8
-; CHECK-NEXT: shl.16b v1, v1, #7
-; CHECK-NEXT: shl.16b v2, v0, #7
-; CHECK-NEXT: cmlt.16b v0, v1, #0
-; CHECK-NEXT: cmlt.16b v1, v2, #0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sext_v32i1:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr w8, [sp, #64]
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: ldr w9, [sp, #72]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: ldr w8, [sp, #80]
+; CHECK-SD-NEXT: mov.b v1[1], w1
+; CHECK-SD-NEXT: mov.b v0[1], w9
+; CHECK-SD-NEXT: ldr w9, [sp]
+; CHECK-SD-NEXT: mov.b v1[2], w2
+; CHECK-SD-NEXT: mov.b v0[2], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #88]
+; CHECK-SD-NEXT: mov.b v1[3], w3
+; CHECK-SD-NEXT: mov.b v0[3], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #96]
+; CHECK-SD-NEXT: mov.b v1[4], w4
+; CHECK-SD-NEXT: mov.b v0[4], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #104]
+; CHECK-SD-NEXT: mov.b v1[5], w5
+; CHECK-SD-NEXT: mov.b v0[5], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #112]
+; CHECK-SD-NEXT: mov.b v1[6], w6
+; CHECK-SD-NEXT: mov.b v0[6], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #120]
+; CHECK-SD-NEXT: mov.b v1[7], w7
+; CHECK-SD-NEXT: mov.b v0[7], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #128]
+; CHECK-SD-NEXT: mov.b v1[8], w9
+; CHECK-SD-NEXT: ldr w9, [sp, #8]
+; CHECK-SD-NEXT: mov.b v0[8], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #136]
+; CHECK-SD-NEXT: mov.b v1[9], w9
+; CHECK-SD-NEXT: ldr w9, [sp, #16]
+; CHECK-SD-NEXT: mov.b v0[9], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #144]
+; CHECK-SD-NEXT: mov.b v1[10], w9
+; CHECK-SD-NEXT: ldr w9, [sp, #24]
+; CHECK-SD-NEXT: mov.b v0[10], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #152]
+; CHECK-SD-NEXT: mov.b v1[11], w9
+; CHECK-SD-NEXT: ldr w9, [sp, #32]
+; CHECK-SD-NEXT: mov.b v0[11], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #160]
+; CHECK-SD-NEXT: mov.b v1[12], w9
+; CHECK-SD-NEXT: ldr w9, [sp, #40]
+; CHECK-SD-NEXT: mov.b v0[12], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #168]
+; CHECK-SD-NEXT: mov.b v1[13], w9
+; CHECK-SD-NEXT: ldr w9, [sp, #48]
+; CHECK-SD-NEXT: mov.b v0[13], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #176]
+; CHECK-SD-NEXT: mov.b v1[14], w9
+; CHECK-SD-NEXT: ldr w9, [sp, #56]
+; CHECK-SD-NEXT: mov.b v0[14], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #184]
+; CHECK-SD-NEXT: mov.b v1[15], w9
+; CHECK-SD-NEXT: mov.b v0[15], w8
+; CHECK-SD-NEXT: shl.16b v1, v1, #7
+; CHECK-SD-NEXT: shl.16b v2, v0, #7
+; CHECK-SD-NEXT: cmlt.16b v0, v1, #0
+; CHECK-SD-NEXT: cmlt.16b v1, v2, #0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sext_v32i1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s17, w0
+; CHECK-GI-NEXT: fmov s19, w4
+; CHECK-GI-NEXT: ldr s0, [sp]
+; CHECK-GI-NEXT: ldr s21, [sp, #8]
+; CHECK-GI-NEXT: ldr s1, [sp, #32]
+; CHECK-GI-NEXT: ldr s22, [sp, #40]
+; CHECK-GI-NEXT: ldr s2, [sp, #64]
+; CHECK-GI-NEXT: ldr s23, [sp, #72]
+; CHECK-GI-NEXT: ldr s3, [sp, #96]
+; CHECK-GI-NEXT: ldr s24, [sp, #104]
+; CHECK-GI-NEXT: mov.s v17[1], w1
+; CHECK-GI-NEXT: mov.s v19[1], w5
+; CHECK-GI-NEXT: ldr s5, [sp, #128]
+; CHECK-GI-NEXT: ldr s20, [sp, #136]
+; CHECK-GI-NEXT: mov.s v0[1], v21[0]
+; CHECK-GI-NEXT: ldr s7, [sp, #160]
+; CHECK-GI-NEXT: ldr s25, [sp, #168]
+; CHECK-GI-NEXT: mov.s v1[1], v22[0]
+; CHECK-GI-NEXT: mov.s v2[1], v23[0]
+; CHECK-GI-NEXT: mov.s v3[1], v24[0]
+; CHECK-GI-NEXT: mov.s v5[1], v20[0]
+; CHECK-GI-NEXT: mov.s v7[1], v25[0]
+; CHECK-GI-NEXT: ldr s16, [sp, #16]
+; CHECK-GI-NEXT: ldr s18, [sp, #48]
+; CHECK-GI-NEXT: ldr s20, [sp, #80]
+; CHECK-GI-NEXT: ldr s21, [sp, #112]
+; CHECK-GI-NEXT: ldr s22, [sp, #144]
+; CHECK-GI-NEXT: ldr s23, [sp, #176]
+; CHECK-GI-NEXT: mov.s v17[2], w2
+; CHECK-GI-NEXT: mov.s v19[2], w6
+; CHECK-GI-NEXT: mov.s v0[2], v16[0]
+; CHECK-GI-NEXT: mov.s v1[2], v18[0]
+; CHECK-GI-NEXT: mov.s v2[2], v20[0]
+; CHECK-GI-NEXT: mov.s v3[2], v21[0]
+; CHECK-GI-NEXT: mov.s v5[2], v22[0]
+; CHECK-GI-NEXT: mov.s v7[2], v23[0]
+; CHECK-GI-NEXT: ldr s4, [sp, #24]
+; CHECK-GI-NEXT: ldr s6, [sp, #56]
+; CHECK-GI-NEXT: ldr s16, [sp, #88]
+; CHECK-GI-NEXT: ldr s18, [sp, #120]
+; CHECK-GI-NEXT: ldr s20, [sp, #152]
+; CHECK-GI-NEXT: ldr s21, [sp, #184]
+; CHECK-GI-NEXT: mov.s v17[3], w3
+; CHECK-GI-NEXT: mov.s v19[3], w7
+; CHECK-GI-NEXT: mov.s v0[3], v4[0]
+; CHECK-GI-NEXT: mov.s v1[3], v6[0]
+; CHECK-GI-NEXT: mov.s v2[3], v16[0]
+; CHECK-GI-NEXT: mov.s v3[3], v18[0]
+; CHECK-GI-NEXT: mov.s v5[3], v20[0]
+; CHECK-GI-NEXT: mov.s v7[3], v21[0]
+; CHECK-GI-NEXT: uzp1.8h v4, v17, v19
+; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
+; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
+; CHECK-GI-NEXT: uzp1.8h v2, v5, v7
+; CHECK-GI-NEXT: uzp1.16b v0, v4, v0
+; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
+; CHECK-GI-NEXT: shl.16b v0, v0, #7
+; CHECK-GI-NEXT: shl.16b v1, v1, #7
+; CHECK-GI-NEXT: sshr.16b v0, v0, #7
+; CHECK-GI-NEXT: sshr.16b v1, v1, #7
+; CHECK-GI-NEXT: ret
%res = sext <32 x i1> %arg to <32 x i8>
ret <32 x i8> %res
}
@@ -607,141 +669,279 @@ define <64 x i8> @zext_v64i1(<64 x i1> %arg) {
}
define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
-; CHECK-LABEL: sext_v64i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr w8, [sp, #336]
-; CHECK-NEXT: ldr w9, [sp, #208]
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: ldr w10, [sp, #80]
-; CHECK-NEXT: ldr w11, [sp, #216]
-; CHECK-NEXT: ldr w12, [sp, #88]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: ldr w8, [sp, #344]
-; CHECK-NEXT: fmov s3, w10
-; CHECK-NEXT: mov.b v2[1], w1
-; CHECK-NEXT: ldr w9, [sp, #224]
-; CHECK-NEXT: ldr w10, [sp, #96]
-; CHECK-NEXT: mov.b v0[1], w8
-; CHECK-NEXT: mov.b v1[1], w11
-; CHECK-NEXT: ldr w8, [sp, #352]
-; CHECK-NEXT: mov.b v3[1], w12
-; CHECK-NEXT: ldr w11, [sp, #144]
-; CHECK-NEXT: mov.b v2[2], w2
-; CHECK-NEXT: mov.b v0[2], w8
-; CHECK-NEXT: mov.b v1[2], w9
-; CHECK-NEXT: ldr w8, [sp, #360]
-; CHECK-NEXT: mov.b v3[2], w10
-; CHECK-NEXT: ldr w9, [sp, #232]
-; CHECK-NEXT: ldr w10, [sp, #104]
-; CHECK-NEXT: mov.b v2[3], w3
-; CHECK-NEXT: mov.b v0[3], w8
-; CHECK-NEXT: mov.b v1[3], w9
-; CHECK-NEXT: ldr w8, [sp, #368]
-; CHECK-NEXT: mov.b v3[3], w10
-; CHECK-NEXT: ldr w9, [sp, #240]
-; CHECK-NEXT: ldr w10, [sp, #112]
-; CHECK-NEXT: mov.b v2[4], w4
-; CHECK-NEXT: mov.b v0[4], w8
-; CHECK-NEXT: mov.b v1[4], w9
-; CHECK-NEXT: ldr w8, [sp, #376]
-; CHECK-NEXT: mov.b v3[4], w10
-; CHECK-NEXT: ldr w9, [sp, #248]
-; CHECK-NEXT: ldr w10, [sp, #120]
-; CHECK-NEXT: mov.b v2[5], w5
-; CHECK-NEXT: mov.b v0[5], w8
-; CHECK-NEXT: mov.b v1[5], w9
-; CHECK-NEXT: ldr w8, [sp, #384]
-; CHECK-NEXT: mov.b v3[5], w10
-; CHECK-NEXT: ldr w9, [sp, #256]
-; CHECK-NEXT: ldr w10, [sp, #128]
-; CHECK-NEXT: mov.b v2[6], w6
-; CHECK-NEXT: mov.b v0[6], w8
-; CHECK-NEXT: mov.b v1[6], w9
-; CHECK-NEXT: ldr w8, [sp, #392]
-; CHECK-NEXT: mov.b v3[6], w10
-; CHECK-NEXT: ldr w9, [sp, #264]
-; CHECK-NEXT: ldr w10, [sp, #136]
-; CHECK-NEXT: mov.b v2[7], w7
-; CHECK-NEXT: mov.b v0[7], w8
-; CHECK-NEXT: mov.b v1[7], w9
-; CHECK-NEXT: ldr w8, [sp, #16]
-; CHECK-NEXT: mov.b v3[7], w10
-; CHECK-NEXT: ldr w9, [sp, #400]
-; CHECK-NEXT: ldr w10, [sp, #272]
-; CHECK-NEXT: mov.b v2[8], w8
-; CHECK-NEXT: ldr w8, [sp, #24]
-; CHECK-NEXT: mov.b v0[8], w9
-; CHECK-NEXT: mov.b v1[8], w10
-; CHECK-NEXT: ldr w9, [sp, #408]
-; CHECK-NEXT: mov.b v3[8], w11
-; CHECK-NEXT: ldr w10, [sp, #280]
-; CHECK-NEXT: ldr w11, [sp, #152]
-; CHECK-NEXT: mov.b v2[9], w8
-; CHECK-NEXT: ldr w8, [sp, #32]
-; CHECK-NEXT: mov.b v0[9], w9
-; CHECK-NEXT: mov.b v1[9], w10
-; CHECK-NEXT: ldr w9, [sp, #416]
-; CHECK-NEXT: mov.b v3[9], w11
-; CHECK-NEXT: ldr w10, [sp, #288]
-; CHECK-NEXT: ldr w11, [sp, #160]
-; CHECK-NEXT: mov.b v2[10], w8
-; CHECK-NEXT: ldr w8, [sp, #40]
-; CHECK-NEXT: mov.b v0[10], w9
-; CHECK-NEXT: mov.b v1[10], w10
-; CHECK-NEXT: ldr w9, [sp, #424]
-; CHECK-NEXT: mov.b v3[10], w11
-; CHECK-NEXT: ldr w10, [sp, #296]
-; CHECK-NEXT: ldr w11, [sp, #168]
-; CHECK-NEXT: mov.b v2[11], w8
-; CHECK-NEXT: ldr w8, [sp, #48]
-; CHECK-NEXT: mov.b v0[11], w9
-; CHECK-NEXT: mov.b v1[11], w10
-; CHECK-NEXT: ldr w9, [sp, #432]
-; CHECK-NEXT: mov.b v3[11], w11
-; CHECK-NEXT: ldr w10, [sp, #304]
-; CHECK-NEXT: ldr w11, [sp, #176]
-; CHECK-NEXT: mov.b v2[12], w8
-; CHECK-NEXT: ldr w8, [sp, #56]
-; CHECK-NEXT: mov.b v0[12], w9
-; CHECK-NEXT: mov.b v1[12], w10
-; CHECK-NEXT: ldr w9, [sp, #440]
-; CHECK-NEXT: mov.b v3[12], w11
-; CHECK-NEXT: ldr w10, [sp, #312]
-; CHECK-NEXT: ldr w11, [sp, #184]
-; CHECK-NEXT: mov.b v2[13], w8
-; CHECK-NEXT: ldr w8, [sp, #64]
-; CHECK-NEXT: mov.b v0[13], w9
-; CHECK-NEXT: mov.b v1[13], w10
-; CHECK-NEXT: ldr w9, [sp, #448]
-; CHECK-NEXT: mov.b v3[13], w11
-; CHECK-NEXT: ldr w10, [sp, #320]
-; CHECK-NEXT: ldr w11, [sp, #192]
-; CHECK-NEXT: mov.b v2[14], w8
-; CHECK-NEXT: ldr w8, [sp, #72]
-; CHECK-NEXT: mov.b v0[14], w9
-; CHECK-NEXT: mov.b v1[14], w10
-; CHECK-NEXT: ldr w9, [sp, #456]
-; CHECK-NEXT: mov.b v3[14], w11
-; CHECK-NEXT: ldr w10, [sp, #328]
-; CHECK-NEXT: ldr w11, [sp, #200]
-; CHECK-NEXT: mov.b v2[15], w8
-; CHECK-NEXT: mov.b v0[15], w9
-; CHECK-NEXT: mov.b v1[15], w10
-; CHECK-NEXT: mov.b v3[15], w11
-; CHECK-NEXT: shl.16b v2, v2, #7
-; CHECK-NEXT: shl.16b v4, v1, #7
-; CHECK-NEXT: shl.16b v5, v0, #7
-; CHECK-NEXT: shl.16b v3, v3, #7
-; CHECK-NEXT: cmlt.16b v0, v2, #0
-; CHECK-NEXT: cmlt.16b v2, v4, #0
-; CHECK-NEXT: cmlt.16b v1, v3, #0
-; CHECK-NEXT: cmlt.16b v3, v5, #0
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sext_v64i1:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w29, -16
+; CHECK-SD-NEXT: ldr w8, [sp, #336]
+; CHECK-SD-NEXT: ldr w9, [sp, #208]
+; CHECK-SD-NEXT: fmov s2, w0
+; CHECK-SD-NEXT: ldr w10, [sp, #80]
+; CHECK-SD-NEXT: ldr w11, [sp, #216]
+; CHECK-SD-NEXT: ldr w12, [sp, #88]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: fmov s1, w9
+; CHECK-SD-NEXT: ldr w8, [sp, #344]
+; CHECK-SD-NEXT: fmov s3, w10
+; CHECK-SD-NEXT: mov.b v2[1], w1
+; CHECK-SD-NEXT: ldr w9, [sp, #224]
+; CHECK-SD-NEXT: ldr w10, [sp, #96]
+; CHECK-SD-NEXT: mov.b v0[1], w8
+; CHECK-SD-NEXT: mov.b v1[1], w11
+; CHECK-SD-NEXT: ldr w8, [sp, #352]
+; CHECK-SD-NEXT: mov.b v3[1], w12
+; CHECK-SD-NEXT: ldr w11, [sp, #144]
+; CHECK-SD-NEXT: mov.b v2[2], w2
+; CHECK-SD-NEXT: mov.b v0[2], w8
+; CHECK-SD-NEXT: mov.b v1[2], w9
+; CHECK-SD-NEXT: ldr w8, [sp, #360]
+; CHECK-SD-NEXT: mov.b v3[2], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #232]
+; CHECK-SD-NEXT: ldr w10, [sp, #104]
+; CHECK-SD-NEXT: mov.b v2[3], w3
+; CHECK-SD-NEXT: mov.b v0[3], w8
+; CHECK-SD-NEXT: mov.b v1[3], w9
+; CHECK-SD-NEXT: ldr w8, [sp, #368]
+; CHECK-SD-NEXT: mov.b v3[3], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #240]
+; CHECK-SD-NEXT: ldr w10, [sp, #112]
+; CHECK-SD-NEXT: mov.b v2[4], w4
+; CHECK-SD-NEXT: mov.b v0[4], w8
+; CHECK-SD-NEXT: mov.b v1[4], w9
+; CHECK-SD-NEXT: ldr w8, [sp, #376]
+; CHECK-SD-NEXT: mov.b v3[4], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #248]
+; CHECK-SD-NEXT: ldr w10, [sp, #120]
+; CHECK-SD-NEXT: mov.b v2[5], w5
+; CHECK-SD-NEXT: mov.b v0[5], w8
+; CHECK-SD-NEXT: mov.b v1[5], w9
+; CHECK-SD-NEXT: ldr w8, [sp, #384]
+; CHECK-SD-NEXT: mov.b v3[5], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #256]
+; CHECK-SD-NEXT: ldr w10, [sp, #128]
+; CHECK-SD-NEXT: mov.b v2[6], w6
+; CHECK-SD-NEXT: mov.b v0[6], w8
+; CHECK-SD-NEXT: mov.b v1[6], w9
+; CHECK-SD-NEXT: ldr w8, [sp, #392]
+; CHECK-SD-NEXT: mov.b v3[6], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #264]
+; CHECK-SD-NEXT: ldr w10, [sp, #136]
+; CHECK-SD-NEXT: mov.b v2[7], w7
+; CHECK-SD-NEXT: mov.b v0[7], w8
+; CHECK-SD-NEXT: mov.b v1[7], w9
+; CHECK-SD-NEXT: ldr w8, [sp, #16]
+; CHECK-SD-NEXT: mov.b v3[7], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #400]
+; CHECK-SD-NEXT: ldr w10, [sp, #272]
+; CHECK-SD-NEXT: mov.b v2[8], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #24]
+; CHECK-SD-NEXT: mov.b v0[8], w9
+; CHECK-SD-NEXT: mov.b v1[8], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #408]
+; CHECK-SD-NEXT: mov.b v3[8], w11
+; CHECK-SD-NEXT: ldr w10, [sp, #280]
+; CHECK-SD-NEXT: ldr w11, [sp, #152]
+; CHECK-SD-NEXT: mov.b v2[9], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #32]
+; CHECK-SD-NEXT: mov.b v0[9], w9
+; CHECK-SD-NEXT: mov.b v1[9], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #416]
+; CHECK-SD-NEXT: mov.b v3[9], w11
+; CHECK-SD-NEXT: ldr w10, [sp, #288]
+; CHECK-SD-NEXT: ldr w11, [sp, #160]
+; CHECK-SD-NEXT: mov.b v2[10], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #40]
+; CHECK-SD-NEXT: mov.b v0[10], w9
+; CHECK-SD-NEXT: mov.b v1[10], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #424]
+; CHECK-SD-NEXT: mov.b v3[10], w11
+; CHECK-SD-NEXT: ldr w10, [sp, #296]
+; CHECK-SD-NEXT: ldr w11, [sp, #168]
+; CHECK-SD-NEXT: mov.b v2[11], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #48]
+; CHECK-SD-NEXT: mov.b v0[11], w9
+; CHECK-SD-NEXT: mov.b v1[11], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #432]
+; CHECK-SD-NEXT: mov.b v3[11], w11
+; CHECK-SD-NEXT: ldr w10, [sp, #304]
+; CHECK-SD-NEXT: ldr w11, [sp, #176]
+; CHECK-SD-NEXT: mov.b v2[12], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #56]
+; CHECK-SD-NEXT: mov.b v0[12], w9
+; CHECK-SD-NEXT: mov.b v1[12], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #440]
+; CHECK-SD-NEXT: mov.b v3[12], w11
+; CHECK-SD-NEXT: ldr w10, [sp, #312]
+; CHECK-SD-NEXT: ldr w11, [sp, #184]
+; CHECK-SD-NEXT: mov.b v2[13], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #64]
+; CHECK-SD-NEXT: mov.b v0[13], w9
+; CHECK-SD-NEXT: mov.b v1[13], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #448]
+; CHECK-SD-NEXT: mov.b v3[13], w11
+; CHECK-SD-NEXT: ldr w10, [sp, #320]
+; CHECK-SD-NEXT: ldr w11, [sp, #192]
+; CHECK-SD-NEXT: mov.b v2[14], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #72]
+; CHECK-SD-NEXT: mov.b v0[14], w9
+; CHECK-SD-NEXT: mov.b v1[14], w10
+; CHECK-SD-NEXT: ldr w9, [sp, #456]
+; CHECK-SD-NEXT: mov.b v3[14], w11
+; CHECK-SD-NEXT: ldr w10, [sp, #328]
+; CHECK-SD-NEXT: ldr w11, [sp, #200]
+; CHECK-SD-NEXT: mov.b v2[15], w8
+; CHECK-SD-NEXT: mov.b v0[15], w9
+; CHECK-SD-NEXT: mov.b v1[15], w10
+; CHECK-SD-NEXT: mov.b v3[15], w11
+; CHECK-SD-NEXT: shl.16b v2, v2, #7
+; CHECK-SD-NEXT: shl.16b v4, v1, #7
+; CHECK-SD-NEXT: shl.16b v5, v0, #7
+; CHECK-SD-NEXT: shl.16b v3, v3, #7
+; CHECK-SD-NEXT: cmlt.16b v0, v2, #0
+; CHECK-SD-NEXT: cmlt.16b v2, v4, #0
+; CHECK-SD-NEXT: cmlt.16b v1, v3, #0
+; CHECK-SD-NEXT: cmlt.16b v3, v5, #0
+; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sext_v64i1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x29, [sp, #16] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NEXT: ldr s0, [sp, #32]
+; CHECK-GI-NEXT: ldr s4, [sp, #40]
+; CHECK-GI-NEXT: ldr s2, [sp, #96]
+; CHECK-GI-NEXT: ldr s5, [sp, #104]
+; CHECK-GI-NEXT: ldr s1, [sp, #64]
+; CHECK-GI-NEXT: ldr s23, [sp, #72]
+; CHECK-GI-NEXT: mov.s v0[1], v4[0]
+; CHECK-GI-NEXT: ldr s28, [sp, #200]
+; CHECK-GI-NEXT: ldr s3, [sp, #128]
+; CHECK-GI-NEXT: mov.s v2[1], v5[0]
+; CHECK-GI-NEXT: mov.s v1[1], v23[0]
+; CHECK-GI-NEXT: ldr s5, [sp, #192]
+; CHECK-GI-NEXT: ldr s7, [sp, #136]
+; CHECK-GI-NEXT: ldr s4, [sp, #160]
+; CHECK-GI-NEXT: ldr s24, [sp, #168]
+; CHECK-GI-NEXT: mov.s v5[1], v28[0]
+; CHECK-GI-NEXT: ldr s6, [sp, #48]
+; CHECK-GI-NEXT: ldr s21, [sp, #80]
+; CHECK-GI-NEXT: mov.s v3[1], v7[0]
+; CHECK-GI-NEXT: mov.s v4[1], v24[0]
+; CHECK-GI-NEXT: ldr s16, [sp, #112]
+; CHECK-GI-NEXT: ldr s29, [sp, #208]
+; CHECK-GI-NEXT: mov.s v0[2], v6[0]
+; CHECK-GI-NEXT: mov.s v1[2], v21[0]
+; CHECK-GI-NEXT: ldr s6, [sp, #224]
+; CHECK-GI-NEXT: ldr s30, [sp, #232]
+; CHECK-GI-NEXT: mov.s v2[2], v16[0]
+; CHECK-GI-NEXT: ldr s20, [sp, #144]
+; CHECK-GI-NEXT: ldr s27, [sp, #176]
+; CHECK-GI-NEXT: mov.s v5[2], v29[0]
+; CHECK-GI-NEXT: mov.s v6[1], v30[0]
+; CHECK-GI-NEXT: ldr s18, [sp, #88]
+; CHECK-GI-NEXT: ldr s19, [sp, #120]
+; CHECK-GI-NEXT: ldr s7, [sp, #256]
+; CHECK-GI-NEXT: ldr s31, [sp, #264]
+; CHECK-GI-NEXT: mov.s v3[2], v20[0]
+; CHECK-GI-NEXT: mov.s v4[2], v27[0]
+; CHECK-GI-NEXT: ldr s25, [sp, #216]
+; CHECK-GI-NEXT: ldr s26, [sp, #240]
+; CHECK-GI-NEXT: ldr s17, [sp, #56]
+; CHECK-GI-NEXT: ldr s22, [sp, #152]
+; CHECK-GI-NEXT: mov.s v1[3], v18[0]
+; CHECK-GI-NEXT: ldr s23, [sp, #184]
+; CHECK-GI-NEXT: mov.s v2[3], v19[0]
+; CHECK-GI-NEXT: ldr s18, [sp, #320]
+; CHECK-GI-NEXT: ldr s27, [sp, #328]
+; CHECK-GI-NEXT: mov.s v7[1], v31[0]
+; CHECK-GI-NEXT: ldr s19, [sp, #352]
+; CHECK-GI-NEXT: ldr s29, [sp, #360]
+; CHECK-GI-NEXT: mov.s v5[3], v25[0]
+; CHECK-GI-NEXT: mov.s v6[2], v26[0]
+; CHECK-GI-NEXT: fmov s25, w0
+; CHECK-GI-NEXT: fmov s26, w4
+; CHECK-GI-NEXT: ldr s28, [sp, #272]
+; CHECK-GI-NEXT: mov.s v0[3], v17[0]
+; CHECK-GI-NEXT: ldr s17, [sp, #288]
+; CHECK-GI-NEXT: ldr s8, [sp, #296]
+; CHECK-GI-NEXT: mov.s v3[3], v22[0]
+; CHECK-GI-NEXT: ldr s20, [sp, #384]
+; CHECK-GI-NEXT: mov.s v4[3], v23[0]
+; CHECK-GI-NEXT: ldr s30, [sp, #392]
+; CHECK-GI-NEXT: ldr s22, [sp, #416]
+; CHECK-GI-NEXT: ldr s31, [sp, #424]
+; CHECK-GI-NEXT: ldr s23, [sp, #448]
+; CHECK-GI-NEXT: mov.s v18[1], v27[0]
+; CHECK-GI-NEXT: mov.s v19[1], v29[0]
+; CHECK-GI-NEXT: ldr s27, [sp, #456]
+; CHECK-GI-NEXT: ldr s24, [sp, #336]
+; CHECK-GI-NEXT: mov.s v17[1], v8[0]
+; CHECK-GI-NEXT: mov.s v7[2], v28[0]
+; CHECK-GI-NEXT: mov.s v25[1], w1
+; CHECK-GI-NEXT: mov.s v26[1], w5
+; CHECK-GI-NEXT: mov.s v20[1], v30[0]
+; CHECK-GI-NEXT: ldr s28, [sp, #368]
+; CHECK-GI-NEXT: mov.s v22[1], v31[0]
+; CHECK-GI-NEXT: mov.s v23[1], v27[0]
+; CHECK-GI-NEXT: ldr s9, [sp, #304]
+; CHECK-GI-NEXT: ldr s27, [sp, #400]
+; CHECK-GI-NEXT: mov.s v18[2], v24[0]
+; CHECK-GI-NEXT: ldr s24, [sp, #432]
+; CHECK-GI-NEXT: mov.s v19[2], v28[0]
+; CHECK-GI-NEXT: ldr s28, [sp, #464]
+; CHECK-GI-NEXT: ldr s16, [sp, #248]
+; CHECK-GI-NEXT: ldr s21, [sp, #280]
+; CHECK-GI-NEXT: mov.s v17[2], v9[0]
+; CHECK-GI-NEXT: mov.s v25[2], w2
+; CHECK-GI-NEXT: mov.s v26[2], w6
+; CHECK-GI-NEXT: mov.s v20[2], v27[0]
+; CHECK-GI-NEXT: mov.s v22[2], v24[0]
+; CHECK-GI-NEXT: mov.s v23[2], v28[0]
+; CHECK-GI-NEXT: ldr s29, [sp, #312]
+; CHECK-GI-NEXT: ldr s27, [sp, #344]
+; CHECK-GI-NEXT: ldr s24, [sp, #376]
+; CHECK-GI-NEXT: ldr s28, [sp, #408]
+; CHECK-GI-NEXT: mov.s v6[3], v16[0]
+; CHECK-GI-NEXT: ldr s16, [sp, #440]
+; CHECK-GI-NEXT: mov.s v7[3], v21[0]
+; CHECK-GI-NEXT: ldr s21, [sp, #472]
+; CHECK-GI-NEXT: mov.s v25[3], w3
+; CHECK-GI-NEXT: mov.s v26[3], w7
+; CHECK-GI-NEXT: mov.s v17[3], v29[0]
+; CHECK-GI-NEXT: mov.s v18[3], v27[0]
+; CHECK-GI-NEXT: mov.s v19[3], v24[0]
+; CHECK-GI-NEXT: mov.s v20[3], v28[0]
+; CHECK-GI-NEXT: mov.s v22[3], v16[0]
+; CHECK-GI-NEXT: mov.s v23[3], v21[0]
+; CHECK-GI-NEXT: uzp1.8h v0, v0, v1
+; CHECK-GI-NEXT: uzp1.8h v1, v2, v3
+; CHECK-GI-NEXT: uzp1.8h v2, v4, v5
+; CHECK-GI-NEXT: uzp1.8h v3, v6, v7
+; CHECK-GI-NEXT: ldr x29, [sp, #16] // 8-byte Folded Reload
+; CHECK-GI-NEXT: uzp1.8h v16, v25, v26
+; CHECK-GI-NEXT: uzp1.8h v4, v17, v18
+; CHECK-GI-NEXT: uzp1.8h v5, v19, v20
+; CHECK-GI-NEXT: uzp1.8h v6, v22, v23
+; CHECK-GI-NEXT: uzp1.16b v1, v1, v2
+; CHECK-GI-NEXT: uzp1.16b v0, v16, v0
+; CHECK-GI-NEXT: uzp1.16b v2, v3, v4
+; CHECK-GI-NEXT: uzp1.16b v3, v5, v6
+; CHECK-GI-NEXT: shl.16b v1, v1, #7
+; CHECK-GI-NEXT: shl.16b v0, v0, #7
+; CHECK-GI-NEXT: shl.16b v2, v2, #7
+; CHECK-GI-NEXT: shl.16b v3, v3, #7
+; CHECK-GI-NEXT: sshr.16b v1, v1, #7
+; CHECK-GI-NEXT: sshr.16b v0, v0, #7
+; CHECK-GI-NEXT: sshr.16b v2, v2, #7
+; CHECK-GI-NEXT: sshr.16b v3, v3, #7
+; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
%res = sext <64 x i1> %arg to <64 x i8>
ret <64 x i8> %res
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index cff60bdf44ca..7c71449a3163 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -284,26 +284,18 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-GI-NEXT: usubl2.4s v3, v3, v4
; CHECK-GI-NEXT: usubl.4s v4, v0, v1
; CHECK-GI-NEXT: usubl2.4s v0, v0, v1
+; CHECK-GI-NEXT: neg.4s v6, v5
+; CHECK-GI-NEXT: neg.4s v7, v3
; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
-; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
-; CHECK-GI-NEXT: neg.4s v16, v5
-; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
+; CHECK-GI-NEXT: neg.4s v16, v4
+; CHECK-GI-NEXT: neg.4s v17, v0
+; CHECK-GI-NEXT: cmgt.4s v18, v2, v3
+; CHECK-GI-NEXT: cmgt.4s v19, v2, v4
; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
-; CHECK-GI-NEXT: neg.4s v17, v3
-; CHECK-GI-NEXT: neg.4s v18, v4
-; CHECK-GI-NEXT: neg.4s v19, v0
-; CHECK-GI-NEXT: shl.4s v1, v1, #31
-; CHECK-GI-NEXT: shl.4s v6, v6, #31
-; CHECK-GI-NEXT: shl.4s v7, v7, #31
-; CHECK-GI-NEXT: shl.4s v2, v2, #31
-; CHECK-GI-NEXT: sshr.4s v1, v1, #31
-; CHECK-GI-NEXT: sshr.4s v6, v6, #31
-; CHECK-GI-NEXT: sshr.4s v7, v7, #31
-; CHECK-GI-NEXT: sshr.4s v2, v2, #31
-; CHECK-GI-NEXT: bsl.16b v1, v16, v5
-; CHECK-GI-NEXT: bit.16b v3, v17, v6
-; CHECK-GI-NEXT: bit.16b v4, v18, v7
-; CHECK-GI-NEXT: bit.16b v0, v19, v2
+; CHECK-GI-NEXT: bsl.16b v1, v6, v5
+; CHECK-GI-NEXT: bit.16b v3, v7, v18
+; CHECK-GI-NEXT: bit.16b v4, v16, v19
+; CHECK-GI-NEXT: bit.16b v0, v17, v2
; CHECK-GI-NEXT: add.4s v1, v1, v3
; CHECK-GI-NEXT: add.4s v0, v4, v0
; CHECK-GI-NEXT: add.4s v0, v1, v0
@@ -340,26 +332,18 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-GI-NEXT: ssubl2.4s v3, v3, v4
; CHECK-GI-NEXT: ssubl.4s v4, v0, v1
; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
+; CHECK-GI-NEXT: neg.4s v6, v5
+; CHECK-GI-NEXT: neg.4s v7, v3
; CHECK-GI-NEXT: cmgt.4s v1, v2, v5
-; CHECK-GI-NEXT: cmgt.4s v6, v2, v3
-; CHECK-GI-NEXT: neg.4s v16, v5
-; CHECK-GI-NEXT: cmgt.4s v7, v2, v4
+; CHECK-GI-NEXT: neg.4s v16, v4
+; CHECK-GI-NEXT: neg.4s v17, v0
+; CHECK-GI-NEXT: cmgt.4s v18, v2, v3
+; CHECK-GI-NEXT: cmgt.4s v19, v2, v4
; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
-; CHECK-GI-NEXT: neg.4s v17, v3
-; CHECK-GI-NEXT: neg.4s v18, v4
-; CHECK-GI-NEXT: neg.4s v19, v0
-; CHECK-GI-NEXT: shl.4s v1, v1, #31
-; CHECK-GI-NEXT: shl.4s v6, v6, #31
-; CHECK-GI-NEXT: shl.4s v7, v7, #31
-; CHECK-GI-NEXT: shl.4s v2, v2, #31
-; CHECK-GI-NEXT: sshr.4s v1, v1, #31
-; CHECK-GI-NEXT: sshr.4s v6, v6, #31
-; CHECK-GI-NEXT: sshr.4s v7, v7, #31
-; CHECK-GI-NEXT: sshr.4s v2, v2, #31
-; CHECK-GI-NEXT: bsl.16b v1, v16, v5
-; CHECK-GI-NEXT: bit.16b v3, v17, v6
-; CHECK-GI-NEXT: bit.16b v4, v18, v7
-; CHECK-GI-NEXT: bit.16b v0, v19, v2
+; CHECK-GI-NEXT: bsl.16b v1, v6, v5
+; CHECK-GI-NEXT: bit.16b v3, v7, v18
+; CHECK-GI-NEXT: bit.16b v4, v16, v19
+; CHECK-GI-NEXT: bit.16b v0, v17, v2
; CHECK-GI-NEXT: add.4s v1, v1, v3
; CHECK-GI-NEXT: add.4s v0, v4, v0
; CHECK-GI-NEXT: add.4s v0, v1, v0
@@ -397,16 +381,12 @@ define i32 @uabd8h_rdx(ptr %a, ptr %b) {
; CHECK-GI-NEXT: movi.2d v0, #0000000000000000
; CHECK-GI-NEXT: usubl.4s v3, v1, v2
; CHECK-GI-NEXT: usubl2.4s v1, v1, v2
-; CHECK-GI-NEXT: cmgt.4s v2, v0, v3
+; CHECK-GI-NEXT: neg.4s v2, v3
+; CHECK-GI-NEXT: neg.4s v4, v1
+; CHECK-GI-NEXT: cmgt.4s v5, v0, v3
; CHECK-GI-NEXT: cmgt.4s v0, v0, v1
-; CHECK-GI-NEXT: neg.4s v4, v3
-; CHECK-GI-NEXT: neg.4s v5, v1
-; CHECK-GI-NEXT: shl.4s v2, v2, #31
-; CHECK-GI-NEXT: shl.4s v0, v0, #31
-; CHECK-GI-NEXT: sshr.4s v2, v2, #31
-; CHECK-GI-NEXT: sshr.4s v0, v0, #31
-; CHECK-GI-NEXT: bsl.16b v2, v4, v3
-; CHECK-GI-NEXT: bsl.16b v0, v5, v1
+; CHECK-GI-NEXT: bif.16b v2, v3, v5
+; CHECK-GI-NEXT: bsl.16b v0, v4, v1
; CHECK-GI-NEXT: add.4s v0, v2, v0
; CHECK-GI-NEXT: addv.4s s0, v0
; CHECK-GI-NEXT: fmov w0, s0
@@ -433,19 +413,15 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-GI-LABEL: sabd8h_rdx:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
; CHECK-GI-NEXT: ssubl.4s v3, v0, v1
; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1
-; CHECK-GI-NEXT: cmgt.4s v1, v2, v3
+; CHECK-GI-NEXT: movi.2d v2, #0000000000000000
+; CHECK-GI-NEXT: neg.4s v1, v3
+; CHECK-GI-NEXT: neg.4s v4, v0
+; CHECK-GI-NEXT: cmgt.4s v5, v2, v3
; CHECK-GI-NEXT: cmgt.4s v2, v2, v0
-; CHECK-GI-NEXT: neg.4s v4, v3
-; CHECK-GI-NEXT: neg.4s v5, v0
-; CHECK-GI-NEXT: shl.4s v1, v1, #31
-; CHECK-GI-NEXT: shl.4s v2, v2, #31
-; CHECK-GI-NEXT: sshr.4s v1, v1, #31
-; CHECK-GI-NEXT: sshr.4s v2, v2, #31
-; CHECK-GI-NEXT: bsl.16b v1, v4, v3
-; CHECK-GI-NEXT: bit.16b v0, v5, v2
+; CHECK-GI-NEXT: bif.16b v1, v3, v5
+; CHECK-GI-NEXT: bit.16b v0, v4, v2
; CHECK-GI-NEXT: add.4s v0, v1, v0
; CHECK-GI-NEXT: addv.4s s0, v0
; CHECK-GI-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmax.ll b/llvm/test/CodeGen/AArch64/arm64-vmax.ll
index d0a36b76cc61..5a132a33c5da 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmax.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
-; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -global-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
define <8 x i8> @smax_8b(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smax_8b:
diff --git a/llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll
index b9cd1bec1774..332fca23815c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll
@@ -1,57 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s
define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.2s v0, v0, v1
-; CHECK: ret
+; CHECK-LABEL: f1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm.2s v0, v0, v1
+; CHECK-NEXT: ret
%vmaxnm2.i = tail call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
ret <2 x float> %vmaxnm2.i
}
define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.4s v0, v0, v1
-; CHECK: ret
+; CHECK-LABEL: f2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm.4s v0, v0, v1
+; CHECK-NEXT: ret
%vmaxnm2.i = tail call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
ret <4 x float> %vmaxnm2.i
}
define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.2d v0, v0, v1
-; CHECK: ret
+; CHECK-LABEL: f3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm.2d v0, v0, v1
+; CHECK-NEXT: ret
%vmaxnm2.i = tail call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
ret <2 x double> %vmaxnm2.i
}
define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
-; CHECK: fminnm.2s v0, v0, v1
-; CHECK: ret
+; CHECK-LABEL: f4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm.2s v0, v0, v1
+; CHECK-NEXT: ret
%vminnm2.i = tail call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
ret <2 x float> %vminnm2.i
}
define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
-; CHECK: fminnm.4s v0, v0, v1
-; CHECK: ret
+; CHECK-LABEL: f5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm.4s v0, v0, v1
+; CHECK-NEXT: ret
%vminnm2.i = tail call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
ret <4 x float> %vminnm2.i
}
define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
-; CHECK: fminnm.2d v0, v0, v1
-; CHECK: ret
+; CHECK-LABEL: f6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm.2d v0, v0, v1
+; CHECK-NEXT: ret
%vminnm2.i = tail call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
ret <2 x double> %vminnm2.i
}
define float @f7(float %a, float %b) nounwind readnone ssp {
-; CHECK: fmaxnm s0, s0, s1
-; CHECK: ret
+; CHECK-LABEL: f7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnm s0, s0, s1
+; CHECK-NEXT: ret
%vmaxnm2.i = tail call float @llvm.aarch64.neon.fmaxnm.f32(float %a, float %b) nounwind
ret float %vmaxnm2.i
}
define double @f8(double %a, double %b) nounwind readnone ssp {
-; CHECK: fminnm d0, d0, d1
-; CHECK: ret
+; CHECK-LABEL: f8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnm d0, d0, d1
+; CHECK-NEXT: ret
%vmaxnm2.i = tail call double @llvm.aarch64.neon.fminnm.f64(double %a, double %b) nounwind
ret double %vmaxnm2.i
}
@@ -67,14 +85,18 @@ declare double @llvm.aarch64.neon.fminnm.f64(double, double) nounwind readnone
define double @test_fmaxnmv(<2 x double> %in) {
; CHECK-LABEL: test_fmaxnmv:
-; CHECK: fmaxnmp.2d d0, v0
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmaxnmp.2d d0, v0
+; CHECK-NEXT: ret
%max = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
ret double %max
}
define double @test_fminnmv(<2 x double> %in) {
; CHECK-LABEL: test_fminnmv:
-; CHECK: fminnmp.2d d0, v0
+; CHECK: // %bb.0:
+; CHECK-NEXT: fminnmp.2d d0, v0
+; CHECK-NEXT: ret
%min = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
ret double %min
}
diff --git a/llvm/test/CodeGen/AArch64/call-rv-marker.ll b/llvm/test/CodeGen/AArch64/call-rv-marker.ll
index fc06809ad09f..de8f5bbfb484 100644
--- a/llvm/test/CodeGen/AArch64/call-rv-marker.ll
+++ b/llvm/test/CodeGen/AArch64/call-rv-marker.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -o - %s | FileCheck --check-prefix=SELDAG --check-prefix=CHECK %s
; RUN: llc -global-isel -o - %s | FileCheck --check-prefix=GISEL --check-prefix=CHECK %s
@@ -25,37 +26,93 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
@fptr = dso_local global ptr null, align 8
define dso_local ptr @rv_marker_1_retain() {
-; CHECK-LABEL: _rv_marker_1_retain:
-; CHECK: bl _foo1
-; CHECK-NEXT: mov x29, x29
-; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_1_retain:
+; SELDAG: ; %bb.0: ; %entry
+; SELDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT: .cfi_def_cfa_offset 16
+; SELDAG-NEXT: .cfi_offset w30, -8
+; SELDAG-NEXT: .cfi_offset w29, -16
+; SELDAG-NEXT: bl _foo1
+; SELDAG-NEXT: mov x29, x29
+; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT: ret
;
+; GISEL-LABEL: rv_marker_1_retain:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: .cfi_offset w30, -8
+; GISEL-NEXT: .cfi_offset w29, -16
+; GISEL-NEXT: bl _foo1
+; GISEL-NEXT: mov x29, x29
+; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT: ret
entry:
%call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
ret ptr %call
}
define dso_local ptr @rv_marker_1_unsafeClaim() {
-; CHECK-LABEL: _rv_marker_1_unsafeClaim:
-; CHECK: bl _foo1
-; CHECK-NEXT: mov x29, x29
-; CHECK-NEXT: bl _objc_unsafeClaimAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_1_unsafeClaim:
+; SELDAG: ; %bb.0: ; %entry
+; SELDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT: .cfi_def_cfa_offset 16
+; SELDAG-NEXT: .cfi_offset w30, -8
+; SELDAG-NEXT: .cfi_offset w29, -16
+; SELDAG-NEXT: bl _foo1
+; SELDAG-NEXT: mov x29, x29
+; SELDAG-NEXT: bl _objc_unsafeClaimAutoreleasedReturnValue
+; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT: ret
;
+; GISEL-LABEL: rv_marker_1_unsafeClaim:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: .cfi_offset w30, -8
+; GISEL-NEXT: .cfi_offset w29, -16
+; GISEL-NEXT: bl _foo1
+; GISEL-NEXT: mov x29, x29
+; GISEL-NEXT: bl _objc_unsafeClaimAutoreleasedReturnValue
+; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT: ret
entry:
%call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_unsafeClaimAutoreleasedReturnValue) ]
ret ptr %call
}
define dso_local void @rv_marker_2_select(i32 %c) {
-; CHECK-LABEL: _rv_marker_2_select:
-; SELDAG: cinc w0, w8, eq
-; GISEL: csinc w0, w8, wzr, eq
-; CHECK-NEXT: bl _foo0
-; CHECK-NEXT: mov x29, x29
-; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue
-; CHECK-NEXT: ldp x29, x30, [sp], #16
-; CHECK-NEXT: b _foo2
+; SELDAG-LABEL: rv_marker_2_select:
+; SELDAG: ; %bb.0: ; %entry
+; SELDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT: .cfi_def_cfa_offset 16
+; SELDAG-NEXT: .cfi_offset w30, -8
+; SELDAG-NEXT: .cfi_offset w29, -16
+; SELDAG-NEXT: mov w8, #1 ; =0x1
+; SELDAG-NEXT: cmp w0, #0
+; SELDAG-NEXT: cinc w0, w8, eq
+; SELDAG-NEXT: bl _foo0
+; SELDAG-NEXT: mov x29, x29
+; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT: b _foo2
;
+; GISEL-LABEL: rv_marker_2_select:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: .cfi_offset w30, -8
+; GISEL-NEXT: .cfi_offset w29, -16
+; GISEL-NEXT: mov w8, #1 ; =0x1
+; GISEL-NEXT: cmp w0, #0
+; GISEL-NEXT: cinc w0, w8, eq
+; GISEL-NEXT: bl _foo0
+; GISEL-NEXT: mov x29, x29
+; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT: b _foo2
entry:
%tobool.not = icmp eq i32 %c, 0
%.sink = select i1 %tobool.not, i32 2, i32 1
@@ -65,11 +122,121 @@ entry:
}
define dso_local void @rv_marker_3() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: _rv_marker_3:
-; CHECK: bl _foo1
-; CHECK-NEXT: mov x29, x29
-; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_3:
+; SELDAG: Lfunc_begin0:
+; SELDAG-NEXT: .cfi_startproc
+; SELDAG-NEXT: .cfi_personality 155, ___gxx_personality_v0
+; SELDAG-NEXT: .cfi_lsda 16, Lexception0
+; SELDAG-NEXT: ; %bb.0: ; %entry
+; SELDAG-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; SELDAG-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT: .cfi_def_cfa_offset 32
+; SELDAG-NEXT: .cfi_offset w30, -8
+; SELDAG-NEXT: .cfi_offset w29, -16
+; SELDAG-NEXT: .cfi_offset w19, -24
+; SELDAG-NEXT: .cfi_offset w20, -32
+; SELDAG-NEXT: bl _foo1
+; SELDAG-NEXT: mov x29, x29
+; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT: mov x19, x0
+; SELDAG-NEXT: Ltmp0:
+; SELDAG-NEXT: bl _objc_object
+; SELDAG-NEXT: Ltmp1:
+; SELDAG-NEXT: ; %bb.1: ; %invoke.cont
+; SELDAG-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT: mov x0, x19
+; SELDAG-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; SELDAG-NEXT: b _objc_release
+; SELDAG-NEXT: LBB3_2: ; %lpad
+; SELDAG-NEXT: Ltmp2:
+; SELDAG-NEXT: mov x20, x0
+; SELDAG-NEXT: mov x0, x19
+; SELDAG-NEXT: bl _objc_release
+; SELDAG-NEXT: mov x0, x20
+; SELDAG-NEXT: bl __Unwind_Resume
+; SELDAG-NEXT: Lfunc_end0:
+; SELDAG-NEXT: .cfi_endproc
+; SELDAG-NEXT: .section __TEXT,__gcc_except_tab
+; SELDAG-NEXT: .p2align 2, 0x0
+; SELDAG-NEXT: GCC_except_table3:
+; SELDAG-NEXT: Lexception0:
+; SELDAG-NEXT: .byte 255 ; @LPStart Encoding = omit
+; SELDAG-NEXT: .byte 255 ; @TType Encoding = omit
+; SELDAG-NEXT: .byte 1 ; Call site Encoding = uleb128
+; SELDAG-NEXT: .uleb128 Lcst_end0-Lcst_begin0
+; SELDAG-NEXT: Lcst_begin0:
+; SELDAG-NEXT: .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 <<
+; SELDAG-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0
+; SELDAG-NEXT: .byte 0 ; has no landing pad
+; SELDAG-NEXT: .byte 0 ; On action: cleanup
+; SELDAG-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 <<
+; SELDAG-NEXT: .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1
+; SELDAG-NEXT: .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2
+; SELDAG-NEXT: .byte 0 ; On action: cleanup
+; SELDAG-NEXT: .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 <<
+; SELDAG-NEXT: .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0
+; SELDAG-NEXT: .byte 0 ; has no landing pad
+; SELDAG-NEXT: .byte 0 ; On action: cleanup
+; SELDAG-NEXT: Lcst_end0:
+; SELDAG-NEXT: .p2align 2, 0x0
;
+; GISEL-LABEL: rv_marker_3:
+; GISEL: Lfunc_begin0:
+; GISEL-NEXT: .cfi_startproc
+; GISEL-NEXT: .cfi_personality 155, ___gxx_personality_v0
+; GISEL-NEXT: .cfi_lsda 16, Lexception0
+; GISEL-NEXT: ; %bb.0: ; %entry
+; GISEL-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; GISEL-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 32
+; GISEL-NEXT: .cfi_offset w30, -8
+; GISEL-NEXT: .cfi_offset w29, -16
+; GISEL-NEXT: .cfi_offset w19, -24
+; GISEL-NEXT: .cfi_offset w20, -32
+; GISEL-NEXT: bl _foo1
+; GISEL-NEXT: mov x29, x29
+; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT: mov x19, x0
+; GISEL-NEXT: Ltmp0:
+; GISEL-NEXT: bl _objc_object
+; GISEL-NEXT: Ltmp1:
+; GISEL-NEXT: ; %bb.1: ; %invoke.cont
+; GISEL-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x0, x19
+; GISEL-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; GISEL-NEXT: b _objc_release
+; GISEL-NEXT: LBB3_2: ; %lpad
+; GISEL-NEXT: Ltmp2:
+; GISEL-NEXT: mov x20, x0
+; GISEL-NEXT: mov x0, x19
+; GISEL-NEXT: bl _objc_release
+; GISEL-NEXT: mov x0, x20
+; GISEL-NEXT: bl __Unwind_Resume
+; GISEL-NEXT: Lfunc_end0:
+; GISEL-NEXT: .cfi_endproc
+; GISEL-NEXT: .section __TEXT,__gcc_except_tab
+; GISEL-NEXT: .p2align 2, 0x0
+; GISEL-NEXT: GCC_except_table3:
+; GISEL-NEXT: Lexception0:
+; GISEL-NEXT: .byte 255 ; @LPStart Encoding = omit
+; GISEL-NEXT: .byte 255 ; @TType Encoding = omit
+; GISEL-NEXT: .byte 1 ; Call site Encoding = uleb128
+; GISEL-NEXT: .uleb128 Lcst_end0-Lcst_begin0
+; GISEL-NEXT: Lcst_begin0:
+; GISEL-NEXT: .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 <<
+; GISEL-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0
+; GISEL-NEXT: .byte 0 ; has no landing pad
+; GISEL-NEXT: .byte 0 ; On action: cleanup
+; GISEL-NEXT: .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 <<
+; GISEL-NEXT: .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1
+; GISEL-NEXT: .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2
+; GISEL-NEXT: .byte 0 ; On action: cleanup
+; GISEL-NEXT: .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 <<
+; GISEL-NEXT: .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0
+; GISEL-NEXT: .byte 0 ; has no landing pad
+; GISEL-NEXT: .byte 0 ; On action: cleanup
+; GISEL-NEXT: Lcst_end0:
+; GISEL-NEXT: .p2align 2, 0x0
entry:
%call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
invoke void @objc_object(ptr %call) #5
@@ -87,13 +254,151 @@ lpad: ; preds = %entry
}
define dso_local void @rv_marker_4() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: _rv_marker_4:
-; CHECK: Ltmp3:
-; CHECK-NEXT: bl _foo1
-; CHECK-NEXT: mov x29, x29
-; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue
-; CHECK-NEXT: Ltmp4:
+; SELDAG-LABEL: rv_marker_4:
+; SELDAG: Lfunc_begin1:
+; SELDAG-NEXT: .cfi_startproc
+; SELDAG-NEXT: .cfi_personality 155, ___gxx_personality_v0
+; SELDAG-NEXT: .cfi_lsda 16, Lexception1
+; SELDAG-NEXT: ; %bb.0: ; %entry
+; SELDAG-NEXT: sub sp, sp, #48
+; SELDAG-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; SELDAG-NEXT: .cfi_def_cfa_offset 48
+; SELDAG-NEXT: .cfi_offset w30, -8
+; SELDAG-NEXT: .cfi_offset w29, -16
+; SELDAG-NEXT: .cfi_offset w19, -24
+; SELDAG-NEXT: .cfi_offset w20, -32
+; SELDAG-NEXT: Ltmp3:
+; SELDAG-NEXT: bl _foo1
+; SELDAG-NEXT: mov x29, x29
+; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT: Ltmp4:
+; SELDAG-NEXT: ; %bb.1: ; %invoke.cont
+; SELDAG-NEXT: Ltmp6:
+; SELDAG-NEXT: mov x19, x0
+; SELDAG-NEXT: bl _objc_object
+; SELDAG-NEXT: Ltmp7:
+; SELDAG-NEXT: ; %bb.2: ; %invoke.cont2
+; SELDAG-NEXT: mov x0, x19
+; SELDAG-NEXT: bl _objc_release
+; SELDAG-NEXT: add x0, sp, #15
+; SELDAG-NEXT: bl __ZN1SD1Ev
+; SELDAG-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; SELDAG-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT: add sp, sp, #48
+; SELDAG-NEXT: ret
+; SELDAG-NEXT: LBB4_3: ; %lpad1
+; SELDAG-NEXT: Ltmp8:
+; SELDAG-NEXT: mov x20, x0
+; SELDAG-NEXT: mov x0, x19
+; SELDAG-NEXT: bl _objc_release
+; SELDAG-NEXT: b LBB4_5
+; SELDAG-NEXT: LBB4_4: ; %lpad
+; SELDAG-NEXT: Ltmp5:
+; SELDAG-NEXT: mov x20, x0
+; SELDAG-NEXT: LBB4_5: ; %ehcleanup
+; SELDAG-NEXT: add x0, sp, #15
+; SELDAG-NEXT: bl __ZN1SD1Ev
+; SELDAG-NEXT: mov x0, x20
+; SELDAG-NEXT: bl __Unwind_Resume
+; SELDAG-NEXT: Lfunc_end1:
+; SELDAG-NEXT: .cfi_endproc
+; SELDAG-NEXT: .section __TEXT,__gcc_except_tab
+; SELDAG-NEXT: .p2align 2, 0x0
+; SELDAG-NEXT: GCC_except_table4:
+; SELDAG-NEXT: Lexception1:
+; SELDAG-NEXT: .byte 255 ; @LPStart Encoding = omit
+; SELDAG-NEXT: .byte 255 ; @TType Encoding = omit
+; SELDAG-NEXT: .byte 1 ; Call site Encoding = uleb128
+; SELDAG-NEXT: .uleb128 Lcst_end1-Lcst_begin1
+; SELDAG-NEXT: Lcst_begin1:
+; SELDAG-NEXT: .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 <<
+; SELDAG-NEXT: .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4
+; SELDAG-NEXT: .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5
+; SELDAG-NEXT: .byte 0 ; On action: cleanup
+; SELDAG-NEXT: .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 <<
+; SELDAG-NEXT: .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7
+; SELDAG-NEXT: .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8
+; SELDAG-NEXT: .byte 0 ; On action: cleanup
+; SELDAG-NEXT: .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 <<
+; SELDAG-NEXT: .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1
+; SELDAG-NEXT: .byte 0 ; has no landing pad
+; SELDAG-NEXT: .byte 0 ; On action: cleanup
+; SELDAG-NEXT: Lcst_end1:
+; SELDAG-NEXT: .p2align 2, 0x0
;
+; GISEL-LABEL: rv_marker_4:
+; GISEL: Lfunc_begin1:
+; GISEL-NEXT: .cfi_startproc
+; GISEL-NEXT: .cfi_personality 155, ___gxx_personality_v0
+; GISEL-NEXT: .cfi_lsda 16, Lexception1
+; GISEL-NEXT: ; %bb.0: ; %entry
+; GISEL-NEXT: sub sp, sp, #48
+; GISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 48
+; GISEL-NEXT: .cfi_offset w30, -8
+; GISEL-NEXT: .cfi_offset w29, -16
+; GISEL-NEXT: .cfi_offset w19, -24
+; GISEL-NEXT: .cfi_offset w20, -32
+; GISEL-NEXT: Ltmp3:
+; GISEL-NEXT: bl _foo1
+; GISEL-NEXT: mov x29, x29
+; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT: Ltmp4:
+; GISEL-NEXT: ; %bb.1: ; %invoke.cont
+; GISEL-NEXT: Ltmp6:
+; GISEL-NEXT: mov x19, x0
+; GISEL-NEXT: bl _objc_object
+; GISEL-NEXT: Ltmp7:
+; GISEL-NEXT: ; %bb.2: ; %invoke.cont2
+; GISEL-NEXT: mov x0, x19
+; GISEL-NEXT: bl _objc_release
+; GISEL-NEXT: add x0, sp, #15
+; GISEL-NEXT: bl __ZN1SD1Ev
+; GISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT: add sp, sp, #48
+; GISEL-NEXT: ret
+; GISEL-NEXT: LBB4_3: ; %lpad1
+; GISEL-NEXT: Ltmp8:
+; GISEL-NEXT: mov x20, x0
+; GISEL-NEXT: mov x0, x19
+; GISEL-NEXT: bl _objc_release
+; GISEL-NEXT: b LBB4_5
+; GISEL-NEXT: LBB4_4: ; %lpad
+; GISEL-NEXT: Ltmp5:
+; GISEL-NEXT: mov x20, x0
+; GISEL-NEXT: LBB4_5: ; %ehcleanup
+; GISEL-NEXT: add x0, sp, #15
+; GISEL-NEXT: bl __ZN1SD1Ev
+; GISEL-NEXT: mov x0, x20
+; GISEL-NEXT: bl __Unwind_Resume
+; GISEL-NEXT: Lfunc_end1:
+; GISEL-NEXT: .cfi_endproc
+; GISEL-NEXT: .section __TEXT,__gcc_except_tab
+; GISEL-NEXT: .p2align 2, 0x0
+; GISEL-NEXT: GCC_except_table4:
+; GISEL-NEXT: Lexception1:
+; GISEL-NEXT: .byte 255 ; @LPStart Encoding = omit
+; GISEL-NEXT: .byte 255 ; @TType Encoding = omit
+; GISEL-NEXT: .byte 1 ; Call site Encoding = uleb128
+; GISEL-NEXT: .uleb128 Lcst_end1-Lcst_begin1
+; GISEL-NEXT: Lcst_begin1:
+; GISEL-NEXT: .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 <<
+; GISEL-NEXT: .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4
+; GISEL-NEXT: .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5
+; GISEL-NEXT: .byte 0 ; On action: cleanup
+; GISEL-NEXT: .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 <<
+; GISEL-NEXT: .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7
+; GISEL-NEXT: .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8
+; GISEL-NEXT: .byte 0 ; On action: cleanup
+; GISEL-NEXT: .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 <<
+; GISEL-NEXT: .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1
+; GISEL-NEXT: .byte 0 ; has no landing pad
+; GISEL-NEXT: .byte 0 ; On action: cleanup
+; GISEL-NEXT: Lcst_end1:
+; GISEL-NEXT: .p2align 2, 0x0
entry:
%s = alloca %struct.S, align 1
call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %s) #2
@@ -129,11 +434,53 @@ ehcleanup: ; preds = %lpad1, %lpad
}
define dso_local ptr @rv_marker_5_indirect_call() {
-; CHECK-LABEL: _rv_marker_5_indirect_call:
-; CHECK: ldr [[ADDR:x[0-9]+]], [
-; CHECK-NEXT: blr [[ADDR]]
-; CHECK-NEXT: mov x29, x29
-; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_5_indirect_call:
+; SELDAG: ; %bb.0: ; %entry
+; SELDAG-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; SELDAG-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT: .cfi_def_cfa_offset 32
+; SELDAG-NEXT: .cfi_offset w30, -8
+; SELDAG-NEXT: .cfi_offset w29, -16
+; SELDAG-NEXT: .cfi_offset w19, -24
+; SELDAG-NEXT: .cfi_offset w20, -32
+; SELDAG-NEXT: Lloh0:
+; SELDAG-NEXT: adrp x8, _fptr@PAGE
+; SELDAG-NEXT: Lloh1:
+; SELDAG-NEXT: ldr x8, [x8, _fptr@PAGEOFF]
+; SELDAG-NEXT: blr x8
+; SELDAG-NEXT: mov x29, x29
+; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT: mov x19, x0
+; SELDAG-NEXT: bl _foo2
+; SELDAG-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT: mov x0, x19
+; SELDAG-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; SELDAG-NEXT: ret
+; SELDAG-NEXT: .loh AdrpLdr Lloh0, Lloh1
+;
+; GISEL-LABEL: rv_marker_5_indirect_call:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; GISEL-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 32
+; GISEL-NEXT: .cfi_offset w30, -8
+; GISEL-NEXT: .cfi_offset w29, -16
+; GISEL-NEXT: .cfi_offset w19, -24
+; GISEL-NEXT: .cfi_offset w20, -32
+; GISEL-NEXT: Lloh0:
+; GISEL-NEXT: adrp x8, _fptr@PAGE
+; GISEL-NEXT: Lloh1:
+; GISEL-NEXT: ldr x8, [x8, _fptr@PAGEOFF]
+; GISEL-NEXT: blr x8
+; GISEL-NEXT: mov x29, x29
+; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT: mov x19, x0
+; GISEL-NEXT: bl _foo2
+; GISEL-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x0, x19
+; GISEL-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; GISEL-NEXT: ret
+; GISEL-NEXT: .loh AdrpLdr Lloh0, Lloh1
entry:
%0 = load ptr, ptr @fptr, align 8
%call = call ptr %0() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
@@ -144,13 +491,35 @@ entry:
declare ptr @foo(i64, i64, i64)
define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) {
-; CHECK-LABEL: _rv_marker_multiarg:
-; CHECK: mov [[TMP:x[0-9]+]], x0
-; CHECK-NEXT: mov x0, x2
-; CHECK-NEXT: mov x2, [[TMP]]
-; CHECK-NEXT: bl _foo
-; CHECK-NEXT: mov x29, x29
-; CHECK-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_multiarg:
+; SELDAG: ; %bb.0:
+; SELDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT: .cfi_def_cfa_offset 16
+; SELDAG-NEXT: .cfi_offset w30, -8
+; SELDAG-NEXT: .cfi_offset w29, -16
+; SELDAG-NEXT: mov x8, x0
+; SELDAG-NEXT: mov x0, x2
+; SELDAG-NEXT: mov x2, x8
+; SELDAG-NEXT: bl _foo
+; SELDAG-NEXT: mov x29, x29
+; SELDAG-NEXT: bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT: ret
+;
+; GISEL-LABEL: rv_marker_multiarg:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: .cfi_offset w30, -8
+; GISEL-NEXT: .cfi_offset w29, -16
+; GISEL-NEXT: mov x3, x0
+; GISEL-NEXT: mov x0, x2
+; GISEL-NEXT: mov x2, x3
+; GISEL-NEXT: bl _foo
+; GISEL-NEXT: mov x29, x29
+; GISEL-NEXT: bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT: ret
call ptr @foo(i64 %c, i64 %b, i64 %a) [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
ret void
}
@@ -158,3 +527,5 @@ define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) {
declare ptr @objc_retainAutoreleasedReturnValue(ptr)
declare ptr @objc_unsafeClaimAutoreleasedReturnValue(ptr)
declare i32 @__gxx_personality_v0(...)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 3c74508bb12b..82e29d0f8a19 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1,81 +1,1784 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-
-declare void @bar(i32)
-
-define void @test_float(float %a, float %b) {
-; CHECK-LABEL: test_float:
-
- %tst1 = fcmp oeq float %a, %b
- br i1 %tst1, label %end, label %t2
-; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK: b.eq .L
-
-t2:
- %tst2 = fcmp une float %b, 0.0
- br i1 %tst2, label %t3, label %end
-; CHECK: fcmp {{s[0-9]+}}, #0.0
-; CHECK: b.eq .L
-
-
-t3:
-; This test can't be implemented with just one A64 conditional
-; branch. LLVM converts "ordered and not equal" to "unordered or
-; equal" before instruction selection, which is what we currently
-; test. Obviously, other sequences are valid.
- %tst3 = fcmp one float %a, %b
- br i1 %tst3, label %t4, label %end
-; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NEXT: b.eq .[[T4:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b.vs .[[T4]]
-t4:
- %tst4 = fcmp uge float %a, -0.0
- br i1 %tst4, label %t5, label %end
-; CHECK-NOT: fcmp {{s[0-9]+}}, #0.0
-; CHECK: b.mi .LBB
-
-t5:
- call void @bar(i32 0)
- ret void
-end:
- ret void
-
-}
-
-define void @test_double(double %a, double %b) {
-; CHECK-LABEL: test_double:
-
- %tst1 = fcmp oeq double %a, %b
- br i1 %tst1, label %end, label %t2
-; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: b.eq .L
-
-t2:
- %tst2 = fcmp une double %b, 0.0
- br i1 %tst2, label %t3, label %end
-; CHECK: fcmp {{d[0-9]+}}, #0.0
-; CHECK: b.eq .L
-
-
-t3:
-; This test can't be implemented with just one A64 conditional
-; branch. LLVM converts "ordered and not equal" to "unordered or
-; equal" before instruction selection, which is what we currently
-; test. Obviously, other sequences are valid.
- %tst3 = fcmp one double %a, %b
- br i1 %tst3, label %t4, label %end
-; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: b.eq .[[T4:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b.vs .[[T4]]
-t4:
- %tst4 = fcmp uge double %a, -0.0
- br i1 %tst4, label %t5, label %end
-; CHECK-NOT: fcmp {{d[0-9]+}}, #0.0
-; CHECK: b.mi .LBB
-
-t5:
- call void @bar(i32 0)
- ret void
-end:
- ret void
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
+; CHECK-GI: warning: Instruction selection used fallback path for v3f64_double
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f64_i32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f32_float
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f32_i32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v7f16_half
+; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for v16f16_half
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v7f16_i32
+; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for v16f16_i32
+
+define double @f64_double(double %a, double %b, double %d, double %e) {
+; CHECK-LABEL: f64_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: fcsel d0, d2, d3, mi
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt double %a, %b
+ %s = select i1 %c, double %d, double %e
+ ret double %s
+}
+
+define i32 @f64_i32(double %a, double %b, i32 %d, i32 %e) {
+; CHECK-LABEL: f64_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmp d0, d1
+; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt double %a, %b
+ %s = select i1 %c, i32 %d, i32 %e
+ ret i32 %s
+}
+
+define float @f32_float(float %a, float %b, float %d, float %e) {
+; CHECK-LABEL: f32_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: fcsel s0, s2, s3, mi
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt float %a, %b
+ %s = select i1 %c, float %d, float %e
+ ret float %s
+}
+
+define i32 @f32_i32(float %a, float %b, i32 %d, i32 %e) {
+; CHECK-LABEL: f32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt float %a, %b
+ %s = select i1 %c, i32 %d, i32 %e
+ ret i32 %s
+}
+
+define half @f16_half(half %a, half %b, half %d, half %e) {
+; CHECK-SD-NOFP16-LABEL: f16_half:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: // kill: def $h3 killed $h3 def $s3
+; CHECK-SD-NOFP16-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s1
+; CHECK-SD-NOFP16-NEXT: fcsel s0, s2, s3, mi
+; CHECK-SD-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: f16_half:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmp h0, h1
+; CHECK-SD-FP16-NEXT: fcsel h0, h2, h3, mi
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-LABEL: f16_half:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcvt s0, h0
+; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-GI-NEXT: // kill: def $h3 killed $h3 def $s3
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: fcmp s0, s1
+; CHECK-GI-NEXT: csel w8, w8, w9, mi
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NEXT: ret
+entry:
+ %c = fcmp olt half %a, %b
+ %s = select i1 %c, half %d, half %e
+ ret half %s
+}
+
+define i32 @f16_i32(half %a, half %b, i32 %d, i32 %e) {
+; CHECK-SD-NOFP16-LABEL: f16_i32:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s1
+; CHECK-SD-NOFP16-NEXT: csel w0, w0, w1, mi
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: f16_i32:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmp h0, h1
+; CHECK-SD-FP16-NEXT: csel w0, w0, w1, mi
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-LABEL: f16_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcvt s0, h0
+; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: fcmp s0, s1
+; CHECK-GI-NEXT: csel w0, w0, w1, mi
+; CHECK-GI-NEXT: ret
+entry:
+ %c = fcmp olt half %a, %b
+ %s = select i1 %c, i32 %d, i32 %e
+ ret i32 %s
+}
+
+define <2 x double> @v2f64_double(<2 x double> %a, <2 x double> %b, <2 x double> %d, <2 x double> %e) {
+; CHECK-LABEL: v2f64_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <2 x double> %a, %b
+ %s = select <2 x i1> %c, <2 x double> %d, <2 x double> %e
+ ret <2 x double> %s
+}
+
+define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double> %d, <3 x double> %e) {
+; CHECK-LABEL: v3f64_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d6 killed $d6 def $q6
+; CHECK-NEXT: // kill: def $d7 killed $d7 def $q7
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-NEXT: ldr d16, [sp, #24]
+; CHECK-NEXT: ldr d17, [sp]
+; CHECK-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v6.d[1], v7.d[0]
+; CHECK-NEXT: ldp d1, d4, [sp, #8]
+; CHECK-NEXT: fcmgt v2.2d, v5.2d, v2.2d
+; CHECK-NEXT: mov v1.d[1], v4.d[0]
+; CHECK-NEXT: fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-NEXT: bsl v2.16b, v17.16b, v16.16b
+; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-NEXT: bsl v0.16b, v6.16b, v1.16b
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <3 x double> %a, %b
+ %s = select <3 x i1> %c, <3 x double> %d, <3 x double> %e
+ ret <3 x double> %s
+}
+
+define <4 x double> @v4f64_double(<4 x double> %a, <4 x double> %b, <4 x double> %d, <4 x double> %e) {
+; CHECK-SD-LABEL: v4f64_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmgt v1.2d, v3.2d, v1.2d
+; CHECK-SD-NEXT: fcmgt v0.2d, v2.2d, v0.2d
+; CHECK-SD-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-SD-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4f64_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmgt v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT: fcmgt v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %c = fcmp olt <4 x double> %a, %b
+ %s = select <4 x i1> %c, <4 x double> %d, <4 x double> %e
+ ret <4 x double> %s
+}
+
+define <2 x i32> @v2f64_i32(<2 x double> %a, <2 x double> %b, <2 x i32> %d, <2 x i32> %e) {
+; CHECK-LABEL: v2f64_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <2 x double> %a, %b
+ %s = select <2 x i1> %c, <2 x i32> %d, <2 x i32> %e
+ ret <2 x i32> %s
+}
+
+define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x i32> %e) {
+; CHECK-LABEL: v3f64_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-NEXT: fcmgt v1.2d, v5.2d, v2.2d
+; CHECK-NEXT: fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: bsl v0.16b, v6.16b, v7.16b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <3 x double> %a, %b
+ %s = select <3 x i1> %c, <3 x i32> %d, <3 x i32> %e
+ ret <3 x i32> %s
+}
+
+define <4 x i32> @v4f64_i32(<4 x double> %a, <4 x double> %b, <4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-LABEL: v4f64_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmgt v1.2d, v3.2d, v1.2d
+; CHECK-SD-NEXT: fcmgt v0.2d, v2.2d, v0.2d
+; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: bsl v0.16b, v4.16b, v5.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4f64_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmgt v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT: fcmgt v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31
+; CHECK-GI-NEXT: bsl v0.16b, v4.16b, v5.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %c = fcmp olt <4 x double> %a, %b
+ %s = select <4 x i1> %c, <4 x i32> %d, <4 x i32> %e
+ ret <4 x i32> %s
+}
+
+define <2 x float> @v2f32_float(<2 x float> %a, <2 x float> %b, <2 x float> %d, <2 x float> %e) {
+; CHECK-LABEL: v2f32_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <2 x float> %a, %b
+ %s = select <2 x i1> %c, <2 x float> %d, <2 x float> %e
+ ret <2 x float> %s
+}
+
+define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, <3 x float> %e) {
+; CHECK-LABEL: v3f32_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <3 x float> %a, %b
+ %s = select <3 x i1> %c, <3 x float> %d, <3 x float> %e
+ ret <3 x float> %s
+}
+
+define <4 x float> @v4f32_float(<4 x float> %a, <4 x float> %b, <4 x float> %d, <4 x float> %e) {
+; CHECK-LABEL: v4f32_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <4 x float> %a, %b
+ %s = select <4 x i1> %c, <4 x float> %d, <4 x float> %e
+ ret <4 x float> %s
+}
+
+define <8 x float> @v8f32_float(<8 x float> %a, <8 x float> %b, <8 x float> %d, <8 x float> %e) {
+; CHECK-SD-LABEL: v8f32_float:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT: fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-SD-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8f32_float:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %c = fcmp olt <8 x float> %a, %b
+ %s = select <8 x i1> %c, <8 x float> %d, <8 x float> %e
+ ret <8 x float> %s
+}
+
+define <2 x i32> @v2f32_i32(<2 x float> %a, <2 x float> %b, <2 x i32> %d, <2 x i32> %e) {
+; CHECK-LABEL: v2f32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <2 x float> %a, %b
+ %s = select <2 x i1> %c, <2 x i32> %d, <2 x i32> %e
+ ret <2 x i32> %s
+}
+
+define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i32> %e) {
+; CHECK-LABEL: v3f32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <3 x float> %a, %b
+ %s = select <3 x i1> %c, <3 x i32> %d, <3 x i32> %e
+ ret <3 x i32> %s
+}
+
+define <4 x i32> @v4f32_i32(<4 x float> %a, <4 x float> %b, <4 x i32> %d, <4 x i32> %e) {
+; CHECK-LABEL: v4f32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = fcmp olt <4 x float> %a, %b
+ %s = select <4 x i1> %c, <4 x i32> %d, <4 x i32> %e
+ ret <4 x i32> %s
+}
+
+define <8 x i32> @v8f32_i32(<8 x float> %a, <8 x float> %b, <8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: v8f32_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT: fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-SD-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8f32_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmgt v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: fcmgt v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %c = fcmp olt <8 x float> %a, %b
+ %s = select <8 x i1> %c, <8 x i32> %d, <8 x i32> %e
+ ret <8 x i32> %s
+}
+
+define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x half> %e) {
+; CHECK-SD-NOFP16-LABEL: v7f16_half:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h0
+; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcmp s5, s4
+; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: csetm w9, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s4, s7
+; CHECK-SD-NOFP16-NEXT: fmov s4, w9
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: mov v4.h[1], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s6, s5
+; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT: mov v4.h[2], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT: fcmp s16, s7
+; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: mov v4.h[3], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s6, s5
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h7
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: mov v4.h[4], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s6, s5
+; CHECK-SD-NOFP16-NEXT: mov v4.h[5], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s1
+; CHECK-SD-NOFP16-NEXT: mov v4.h[6], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: mov v4.h[7], w8
+; CHECK-SD-NOFP16-NEXT: mov v0.16b, v4.16b
+; CHECK-SD-NOFP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v7f16_half:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v7f16_half:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h1
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h0
+; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt s4, h4
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: fcmp s5, s4
+; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s7, s6
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s4, h4
+; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[4]
+; CHECK-GI-NOFP16-NEXT: csetm w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s4, s7
+; CHECK-GI-NOFP16-NEXT: fmov s4, w9
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h6
+; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], w8
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s5
+; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h7
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], w8
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h6
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s7
+; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], w8
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s5
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h7
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: mov v4.h[4], w8
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s5
+; CHECK-GI-NOFP16-NEXT: mov v4.h[5], w8
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT: mov v4.h[6], w8
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v4.h[7], w8
+; CHECK-GI-NOFP16-NEXT: mov v0.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v7f16_half:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <7 x half> %a, %b
+ %s = select <7 x i1> %c, <7 x half> %d, <7 x half> %e
+ ret <7 x half> %s
+}
+
+define <4 x half> @v4f16_half(<4 x half> %a, <4 x half> %b, <4 x half> %d, <4 x half> %e) {
+; CHECK-SD-NOFP16-LABEL: v4f16_half:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT: xtn v0.4h, v0.4s
+; CHECK-SD-NOFP16-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v4f16_half:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h
+; CHECK-SD-FP16-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v4f16_half:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h1
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s4, h4
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h17
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s4, s5
+; CHECK-GI-NOFP16-NEXT: fmov s4, w8
+; CHECK-GI-NOFP16-NEXT: cset w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fmov s5, w9
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s0, w8
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s0, w8
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: shl v0.4h, v4.4h, #15
+; CHECK-GI-NOFP16-NEXT: sshr v0.4h, v0.4h, #15
+; CHECK-GI-NOFP16-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v4f16_half:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h
+; CHECK-GI-FP16-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <4 x half> %a, %b
+ %s = select <4 x i1> %c, <4 x half> %d, <4 x half> %e
+ ret <4 x half> %s
+}
+
+define <8 x half> @v8f16_half(<8 x half> %a, <8 x half> %b, <8 x half> %d, <8 x half> %e) {
+; CHECK-SD-NOFP16-LABEL: v8f16_half:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h0
+; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcmp s5, s4
+; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: csetm w9, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s4, s7
+; CHECK-SD-NOFP16-NEXT: fmov s4, w9
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: mov v4.h[1], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s6, s5
+; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT: mov v4.h[2], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT: fcmp s16, s7
+; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: mov v4.h[3], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s6, s5
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h7
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: mov v4.h[4], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s6, s5
+; CHECK-SD-NOFP16-NEXT: mov v4.h[5], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s1
+; CHECK-SD-NOFP16-NEXT: mov v4.h[6], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: mov v4.h[7], w8
+; CHECK-SD-NOFP16-NEXT: mov v0.16b, v4.16b
+; CHECK-SD-NOFP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v8f16_half:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v8f16_half:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h1
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h18, v1.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s4, h4
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h17
+; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s4, s5
+; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h6
+; CHECK-GI-NOFP16-NEXT: fmov s4, w8
+; CHECK-GI-NOFP16-NEXT: cset w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s7, s16
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h17
+; CHECK-GI-NOFP16-NEXT: fmov s16, w9
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h18
+; CHECK-GI-NOFP16-NEXT: mov h18, v1.h[5]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[5]
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fmov s6, w8
+; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s5, s17
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v6.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h18
+; CHECK-GI-NOFP16-NEXT: fmov s17, w8
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h7
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v17.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h6
+; CHECK-GI-NOFP16-NEXT: fcmp s5, s16
+; CHECK-GI-NOFP16-NEXT: fmov s5, w8
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: mov v4.h[4], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fmov s5, w8
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT: mov v4.h[5], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s0, w8
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v4.h[6], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s0, w8
+; CHECK-GI-NOFP16-NEXT: mov v4.h[7], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: shl v0.8h, v4.8h, #15
+; CHECK-GI-NOFP16-NEXT: sshr v0.8h, v0.8h, #15
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v8f16_half:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <8 x half> %a, %b
+ %s = select <8 x i1> %c, <8 x half> %d, <8 x half> %e
+ ret <8 x half> %s
+}
+
+define <16 x half> @v16f16_half(<16 x half> %a, <16 x half> %b, <16 x half> %d, <16 x half> %e) {
+; CHECK-SD-NOFP16-LABEL: v16f16_half:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h18, v3.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h19, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt s20, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s21, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[3]
+; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s21, s20
+; CHECK-SD-NOFP16-NEXT: mov h20, v3.h[4]
+; CHECK-SD-NOFP16-NEXT: mov h21, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: csetm w14, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: mov h18, v3.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h19, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt s20, h20
+; CHECK-SD-NOFP16-NEXT: fcvt s21, h21
+; CHECK-SD-NOFP16-NEXT: csetm w13, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: csetm w11, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s21, s20
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: csetm w12, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: csetm w10, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h18
+; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
+; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: csetm w9, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s1, s3
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: csetm w15, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[3]
+; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: csetm w16, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s1
+; CHECK-SD-NOFP16-NEXT: fmov s1, w14
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: csetm w14, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[4]
+; CHECK-SD-NOFP16-NEXT: fmov s3, w14
+; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: mov v1.h[1], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[5]
+; CHECK-SD-NOFP16-NEXT: mov v3.h[1], w16
+; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: mov v1.h[2], w13
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: mov v3.h[2], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov v1.h[3], w11
+; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: mov v3.h[3], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
+; CHECK-SD-NOFP16-NEXT: mov v1.h[4], w12
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: mov v3.h[4], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov v1.h[5], w10
+; CHECK-SD-NOFP16-NEXT: mov v3.h[5], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s2
+; CHECK-SD-NOFP16-NEXT: mov v1.h[6], w9
+; CHECK-SD-NOFP16-NEXT: mov v3.h[6], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: mov v1.h[7], w15
+; CHECK-SD-NOFP16-NEXT: mov v3.h[7], w8
+; CHECK-SD-NOFP16-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-SD-NOFP16-NEXT: mov v0.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v16f16_half:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmgt v1.8h, v3.8h, v1.8h
+; CHECK-SD-FP16-NEXT: fcmgt v0.8h, v2.8h, v0.8h
+; CHECK-SD-FP16-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-SD-FP16-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v16f16_half:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h17, v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h2
+; CHECK-GI-NOFP16-NEXT: mov h20, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h21, v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov h19, v2.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s20, h20
+; CHECK-GI-NOFP16-NEXT: fcvt s21, h21
+; CHECK-GI-NOFP16-NEXT: cset w14, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov h17, v2.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h18
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h19
+; CHECK-GI-NOFP16-NEXT: cset w15, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s20, s21
+; CHECK-GI-NOFP16-NEXT: mov h20, v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov h21, v2.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: cset w13, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h19, v2.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvt s20, h20
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-GI-NOFP16-NEXT: fcvt s21, h21
+; CHECK-GI-NOFP16-NEXT: mov h2, v2.h[7]
+; CHECK-GI-NOFP16-NEXT: cset w12, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h18
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h1
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h19
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h3
+; CHECK-GI-NOFP16-NEXT: cset w11, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s20, s21
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: mov h20, v3.h[3]
+; CHECK-GI-NOFP16-NEXT: cset w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h17, v3.h[1]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s2
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h2, v3.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: cset w10, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h2
+; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[4]
+; CHECK-GI-NOFP16-NEXT: cset w16, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v3.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h18
+; CHECK-GI-NOFP16-NEXT: fmov s18, w16
+; CHECK-GI-NOFP16-NEXT: cset w17, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s19
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h20
+; CHECK-GI-NOFP16-NEXT: fmov s0, w14
+; CHECK-GI-NOFP16-NEXT: fmov s20, w15
+; CHECK-GI-NOFP16-NEXT: fmov s21, w17
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: cset w14, mi
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v20.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v18.h[1], v21.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h20, v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov h21, v3.h[5]
+; CHECK-GI-NOFP16-NEXT: fcmp s17, s19
+; CHECK-GI-NOFP16-NEXT: fmov s17, w13
+; CHECK-GI-NOFP16-NEXT: fmov s19, w14
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v17.h[0]
+; CHECK-GI-NOFP16-NEXT: cset w13, mi
+; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov v18.h[2], v19.h[0]
+; CHECK-GI-NOFP16-NEXT: mov h19, v3.h[6]
+; CHECK-GI-NOFP16-NEXT: fcmp s2, s16
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h20
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h21
+; CHECK-GI-NOFP16-NEXT: fmov s20, w12
+; CHECK-GI-NOFP16-NEXT: fmov s21, w13
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-GI-NOFP16-NEXT: mov h3, v3.h[7]
+; CHECK-GI-NOFP16-NEXT: cset w12, mi
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v20.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h19
+; CHECK-GI-NOFP16-NEXT: mov v18.h[3], v21.h[0]
+; CHECK-GI-NOFP16-NEXT: fcmp s2, s16
+; CHECK-GI-NOFP16-NEXT: fmov s2, w11
+; CHECK-GI-NOFP16-NEXT: fmov s16, w12
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: cset w11, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s17, s19
+; CHECK-GI-NOFP16-NEXT: mov v18.h[4], v16.h[0]
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h3
+; CHECK-GI-NOFP16-NEXT: fmov s3, w9
+; CHECK-GI-NOFP16-NEXT: fmov s16, w11
+; CHECK-GI-NOFP16-NEXT: cset w9, mi
+; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v18.h[5], v16.h[0]
+; CHECK-GI-NOFP16-NEXT: fcmp s1, s2
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: fmov s2, w9
+; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fmov s1, w10
+; CHECK-GI-NOFP16-NEXT: mov v18.h[6], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s2, w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v18.h[7], v2.h[0]
+; CHECK-GI-NOFP16-NEXT: shl v0.8h, v0.8h, #15
+; CHECK-GI-NOFP16-NEXT: shl v1.8h, v18.8h, #15
+; CHECK-GI-NOFP16-NEXT: sshr v0.8h, v0.8h, #15
+; CHECK-GI-NOFP16-NEXT: sshr v1.8h, v1.8h, #15
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-NOFP16-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v16f16_half:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fcmgt v1.8h, v3.8h, v1.8h
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v2.8h, v0.8h
+; CHECK-GI-FP16-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <16 x half> %a, %b
+ %s = select <16 x i1> %c, <16 x half> %d, <16 x half> %e
+ ret <16 x half> %s
+}
+
+define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32> %e) {
+; CHECK-SD-NOFP16-LABEL: v7f16_i32:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: csetm w9, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s5, s4
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT: csetm w10, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h4
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h5
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: csetm w11, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: csetm w12, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h5
+; CHECK-SD-NOFP16-NEXT: fmov s4, w9
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: add x9, sp, #8
+; CHECK-SD-NOFP16-NEXT: csetm w13, mi
+; CHECK-SD-NOFP16-NEXT: fmov s5, w13
+; CHECK-SD-NOFP16-NEXT: mov v4.h[1], w8
+; CHECK-SD-NOFP16-NEXT: mov x8, sp
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s2
+; CHECK-SD-NOFP16-NEXT: fmov s2, w7
+; CHECK-SD-NOFP16-NEXT: fmov s3, w0
+; CHECK-SD-NOFP16-NEXT: mov v5.h[1], w12
+; CHECK-SD-NOFP16-NEXT: ld1 { v2.s }[1], [x8]
+; CHECK-SD-NOFP16-NEXT: mov v3.s[1], w1
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: mov v4.h[2], w10
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s1
+; CHECK-SD-NOFP16-NEXT: fmov s1, w4
+; CHECK-SD-NOFP16-NEXT: ldr s0, [sp, #24]
+; CHECK-SD-NOFP16-NEXT: mov v5.h[2], w8
+; CHECK-SD-NOFP16-NEXT: ld1 { v2.s }[2], [x9]
+; CHECK-SD-NOFP16-NEXT: add x9, sp, #32
+; CHECK-SD-NOFP16-NEXT: mov v3.s[2], w2
+; CHECK-SD-NOFP16-NEXT: mov v1.s[1], w5
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: mov v4.h[3], w11
+; CHECK-SD-NOFP16-NEXT: ld1 { v0.s }[1], [x9]
+; CHECK-SD-NOFP16-NEXT: mov v5.h[3], w8
+; CHECK-SD-NOFP16-NEXT: add x8, sp, #16
+; CHECK-SD-NOFP16-NEXT: ld1 { v2.s }[3], [x8]
+; CHECK-SD-NOFP16-NEXT: mov v3.s[3], w3
+; CHECK-SD-NOFP16-NEXT: add x8, sp, #40
+; CHECK-SD-NOFP16-NEXT: mov v1.s[2], w6
+; CHECK-SD-NOFP16-NEXT: sshll v4.4s, v4.4h, #0
+; CHECK-SD-NOFP16-NEXT: ld1 { v0.s }[2], [x8]
+; CHECK-SD-NOFP16-NEXT: sshll v5.4s, v5.4h, #0
+; CHECK-SD-NOFP16-NEXT: bit v2.16b, v3.16b, v4.16b
+; CHECK-SD-NOFP16-NEXT: bit v0.16b, v1.16b, v5.16b
+; CHECK-SD-NOFP16-NEXT: mov w1, v2.s[1]
+; CHECK-SD-NOFP16-NEXT: mov w2, v2.s[2]
+; CHECK-SD-NOFP16-NEXT: mov w3, v2.s[3]
+; CHECK-SD-NOFP16-NEXT: fmov w0, s2
+; CHECK-SD-NOFP16-NEXT: mov w5, v0.s[1]
+; CHECK-SD-NOFP16-NEXT: mov w6, v0.s[2]
+; CHECK-SD-NOFP16-NEXT: fmov w4, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v7f16_i32:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fmov s2, w0
+; CHECK-SD-FP16-NEXT: fmov s3, w7
+; CHECK-SD-FP16-NEXT: mov x8, sp
+; CHECK-SD-FP16-NEXT: fmov s5, w4
+; CHECK-SD-FP16-NEXT: ldr s4, [sp, #24]
+; CHECK-SD-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT: add x9, sp, #32
+; CHECK-SD-FP16-NEXT: mov v2.s[1], w1
+; CHECK-SD-FP16-NEXT: ld1 { v3.s }[1], [x8]
+; CHECK-SD-FP16-NEXT: add x8, sp, #8
+; CHECK-SD-FP16-NEXT: mov v5.s[1], w5
+; CHECK-SD-FP16-NEXT: ld1 { v4.s }[1], [x9]
+; CHECK-SD-FP16-NEXT: add x9, sp, #16
+; CHECK-SD-FP16-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-SD-FP16-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-SD-FP16-NEXT: ld1 { v3.s }[2], [x8]
+; CHECK-SD-FP16-NEXT: add x8, sp, #40
+; CHECK-SD-FP16-NEXT: mov v2.s[2], w2
+; CHECK-SD-FP16-NEXT: ld1 { v4.s }[2], [x8]
+; CHECK-SD-FP16-NEXT: mov v5.s[2], w6
+; CHECK-SD-FP16-NEXT: ld1 { v3.s }[3], [x9]
+; CHECK-SD-FP16-NEXT: mov v2.s[3], w3
+; CHECK-SD-FP16-NEXT: bsl v0.16b, v5.16b, v4.16b
+; CHECK-SD-FP16-NEXT: bsl v1.16b, v2.16b, v3.16b
+; CHECK-SD-FP16-NEXT: mov w5, v0.s[1]
+; CHECK-SD-FP16-NEXT: mov w6, v0.s[2]
+; CHECK-SD-FP16-NEXT: fmov w4, s0
+; CHECK-SD-FP16-NEXT: mov w1, v1.s[1]
+; CHECK-SD-FP16-NEXT: mov w2, v1.s[2]
+; CHECK-SD-FP16-NEXT: mov w3, v1.s[3]
+; CHECK-SD-FP16-NEXT: fmov w0, s1
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v7f16_i32:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h1
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
+; CHECK-GI-NOFP16-NEXT: fcvt s4, h4
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: fcmp s3, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s7, s6
+; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
+; CHECK-GI-NOFP16-NEXT: csetm w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s5, s4
+; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h6
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h7
+; CHECK-GI-NOFP16-NEXT: csetm w10, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s3, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h4
+; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h5
+; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-GI-NOFP16-NEXT: csetm w11, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s7, s6
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: csetm w12, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s3, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h4
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h5
+; CHECK-GI-NOFP16-NEXT: fmov s4, w9
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: add x9, sp, #8
+; CHECK-GI-NOFP16-NEXT: csetm w13, mi
+; CHECK-GI-NOFP16-NEXT: fmov s5, w13
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], w8
+; CHECK-GI-NOFP16-NEXT: mov x8, sp
+; CHECK-GI-NOFP16-NEXT: fcmp s3, s2
+; CHECK-GI-NOFP16-NEXT: fmov s2, w7
+; CHECK-GI-NOFP16-NEXT: fmov s3, w0
+; CHECK-GI-NOFP16-NEXT: mov v5.h[1], w12
+; CHECK-GI-NOFP16-NEXT: ld1 { v2.s }[1], [x8]
+; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w1
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], w10
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT: fmov s1, w4
+; CHECK-GI-NOFP16-NEXT: ldr s0, [sp, #24]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[2], w8
+; CHECK-GI-NOFP16-NEXT: ld1 { v2.s }[2], [x9]
+; CHECK-GI-NOFP16-NEXT: add x9, sp, #32
+; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w2
+; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w5
+; CHECK-GI-NOFP16-NEXT: csetm w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], w11
+; CHECK-GI-NOFP16-NEXT: ld1 { v0.s }[1], [x9]
+; CHECK-GI-NOFP16-NEXT: mov v5.h[3], w8
+; CHECK-GI-NOFP16-NEXT: add x8, sp, #16
+; CHECK-GI-NOFP16-NEXT: ld1 { v2.s }[3], [x8]
+; CHECK-GI-NOFP16-NEXT: mov v3.s[3], w3
+; CHECK-GI-NOFP16-NEXT: add x8, sp, #40
+; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w6
+; CHECK-GI-NOFP16-NEXT: sshll v4.4s, v4.4h, #0
+; CHECK-GI-NOFP16-NEXT: ld1 { v0.s }[2], [x8]
+; CHECK-GI-NOFP16-NEXT: sshll v5.4s, v5.4h, #0
+; CHECK-GI-NOFP16-NEXT: bit v2.16b, v3.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT: bit v0.16b, v1.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT: mov w1, v2.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w2, v2.s[2]
+; CHECK-GI-NOFP16-NEXT: mov w3, v2.s[3]
+; CHECK-GI-NOFP16-NEXT: fmov w0, s2
+; CHECK-GI-NOFP16-NEXT: mov w5, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov w6, v0.s[2]
+; CHECK-GI-NOFP16-NEXT: fmov w4, s0
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v7f16_i32:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fmov s2, w0
+; CHECK-GI-FP16-NEXT: fmov s3, w7
+; CHECK-GI-FP16-NEXT: mov x8, sp
+; CHECK-GI-FP16-NEXT: fmov s5, w4
+; CHECK-GI-FP16-NEXT: ldr s4, [sp, #24]
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: add x9, sp, #32
+; CHECK-GI-FP16-NEXT: mov v2.s[1], w1
+; CHECK-GI-FP16-NEXT: ld1 { v3.s }[1], [x8]
+; CHECK-GI-FP16-NEXT: add x8, sp, #8
+; CHECK-GI-FP16-NEXT: mov v5.s[1], w5
+; CHECK-GI-FP16-NEXT: ld1 { v4.s }[1], [x9]
+; CHECK-GI-FP16-NEXT: add x9, sp, #16
+; CHECK-GI-FP16-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-FP16-NEXT: ld1 { v3.s }[2], [x8]
+; CHECK-GI-FP16-NEXT: add x8, sp, #40
+; CHECK-GI-FP16-NEXT: mov v2.s[2], w2
+; CHECK-GI-FP16-NEXT: ld1 { v4.s }[2], [x8]
+; CHECK-GI-FP16-NEXT: mov v5.s[2], w6
+; CHECK-GI-FP16-NEXT: ld1 { v3.s }[3], [x9]
+; CHECK-GI-FP16-NEXT: mov v2.s[3], w3
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v5.16b, v4.16b
+; CHECK-GI-FP16-NEXT: bsl v1.16b, v2.16b, v3.16b
+; CHECK-GI-FP16-NEXT: mov w5, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov w6, v0.s[2]
+; CHECK-GI-FP16-NEXT: fmov w4, s0
+; CHECK-GI-FP16-NEXT: mov w1, v1.s[1]
+; CHECK-GI-FP16-NEXT: mov w2, v1.s[2]
+; CHECK-GI-FP16-NEXT: mov w3, v1.s[3]
+; CHECK-GI-FP16-NEXT: fmov w0, s1
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <7 x half> %a, %b
+ %s = select <7 x i1> %c, <7 x i32> %d, <7 x i32> %e
+ ret <7 x i32> %s
+}
+
+define <4 x i32> @v4f16_i32(<4 x half> %a, <4 x half> %b, <4 x i32> %d, <4 x i32> %e) {
+; CHECK-SD-NOFP16-LABEL: v4f16_i32:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-SD-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-SD-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NOFP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v4f16_i32:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h
+; CHECK-SD-FP16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-FP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v4f16_i32:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h1
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s4, h4
+; CHECK-GI-NOFP16-NEXT: fcvt s5, h5
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h17
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s4, s5
+; CHECK-GI-NOFP16-NEXT: fmov s4, w8
+; CHECK-GI-NOFP16-NEXT: cset w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fmov s5, w9
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s0, w8
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s0, w8
+; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: ushll v0.4s, v4.4h, #0
+; CHECK-GI-NOFP16-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT: sshr v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v4f16_i32:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h
+; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <4 x half> %a, %b
+ %s = select <4 x i1> %c, <4 x i32> %d, <4 x i32> %e
+ ret <4 x i32> %s
+}
+
+define <8 x i32> @v8f16_i32(<8 x half> %a, <8 x half> %b, <8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-NOFP16-LABEL: v8f16_i32:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT: csetm w9, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: csetm w10, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h0
+; CHECK-SD-NOFP16-NEXT: csetm w11, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: csetm w12, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: fcvt s6, h16
+; CHECK-SD-NOFP16-NEXT: fmov s16, w9
+; CHECK-SD-NOFP16-NEXT: fcvt s7, h17
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: csetm w13, mi
+; CHECK-SD-NOFP16-NEXT: fmov s17, w13
+; CHECK-SD-NOFP16-NEXT: mov v16.h[1], w8
+; CHECK-SD-NOFP16-NEXT: fcmp s7, s6
+; CHECK-SD-NOFP16-NEXT: mov v17.h[1], w12
+; CHECK-SD-NOFP16-NEXT: mov v16.h[2], w10
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s1
+; CHECK-SD-NOFP16-NEXT: mov v17.h[2], w8
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: mov v16.h[3], w11
+; CHECK-SD-NOFP16-NEXT: mov v17.h[3], w8
+; CHECK-SD-NOFP16-NEXT: sshll v1.4s, v16.4h, #0
+; CHECK-SD-NOFP16-NEXT: sshll v0.4s, v17.4h, #0
+; CHECK-SD-NOFP16-NEXT: bsl v1.16b, v3.16b, v5.16b
+; CHECK-SD-NOFP16-NEXT: bsl v0.16b, v2.16b, v4.16b
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v8f16_i32:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT: sshll v6.4s, v0.4h, #0
+; CHECK-SD-FP16-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-SD-FP16-NEXT: mov v1.16b, v0.16b
+; CHECK-SD-FP16-NEXT: mov v0.16b, v6.16b
+; CHECK-SD-FP16-NEXT: bsl v1.16b, v3.16b, v5.16b
+; CHECK-SD-FP16-NEXT: bsl v0.16b, v2.16b, v4.16b
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v8f16_i32:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h1
+; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h19, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h6
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h7
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h18
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h19
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: cset w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov h19, v1.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h6
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h7
+; CHECK-GI-NOFP16-NEXT: cset w10, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h18
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h19
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-GI-NOFP16-NEXT: cset w11, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fcvt s6, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s7, h17
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: cset w12, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fmov s16, w12
+; CHECK-GI-NOFP16-NEXT: cset w13, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s6, s7
+; CHECK-GI-NOFP16-NEXT: fmov s6, w8
+; CHECK-GI-NOFP16-NEXT: fmov s7, w9
+; CHECK-GI-NOFP16-NEXT: fmov s17, w13
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s1
+; CHECK-GI-NOFP16-NEXT: fmov s0, w10
+; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s0, w11
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: mov v6.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v16.h[3], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: ushll v0.4s, v6.4h, #0
+; CHECK-GI-NOFP16-NEXT: ushll v1.4s, v16.4h, #0
+; CHECK-GI-NOFP16-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-GI-NOFP16-NEXT: sshr v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT: sshr v1.4s, v1.4s, #31
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v2.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT: bsl v1.16b, v3.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v8f16_i32:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-FP16-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT: sshr v1.4s, v1.4s, #31
+; CHECK-GI-FP16-NEXT: sshr v6.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-FP16-NEXT: mov v1.16b, v6.16b
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v2.16b, v4.16b
+; CHECK-GI-FP16-NEXT: bsl v1.16b, v3.16b, v5.16b
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <8 x half> %a, %b
+ %s = select <8 x i1> %c, <8 x i32> %d, <8 x i32> %e
+ ret <8 x i32> %s
+}
+
+define <16 x i32> @v16f16_i32(<16 x half> %a, <16 x half> %b, <16 x i32> %d, <16 x i32> %e) {
+; CHECK-SD-NOFP16-LABEL: v16f16_i32:
+; CHECK-SD-NOFP16: // %bb.0: // %entry
+; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: mov h18, v3.h[4]
+; CHECK-SD-NOFP16-NEXT: mov h19, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: mov h20, v3.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h21, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: fcvt s20, h20
+; CHECK-SD-NOFP16-NEXT: fcvt s21, h21
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[7]
+; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: mov h18, v3.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h19, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: csetm w10, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s21, s20
+; CHECK-SD-NOFP16-NEXT: fcvt s20, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s21, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: csetm w9, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h17, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[3]
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: csetm w11, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: csetm w12, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s21, s20
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: csetm w14, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[4]
+; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: csetm w13, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s1, s3
+; CHECK-SD-NOFP16-NEXT: mov h1, v2.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT: csetm w15, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[7]
+; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: csetm w16, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: csetm w17, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h16
+; CHECK-SD-NOFP16-NEXT: fcvt s16, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h17
+; CHECK-SD-NOFP16-NEXT: fcvt s17, h0
+; CHECK-SD-NOFP16-NEXT: csetm w18, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s19, s18
+; CHECK-SD-NOFP16-NEXT: fmov s18, w14
+; CHECK-SD-NOFP16-NEXT: fmov s19, w17
+; CHECK-SD-NOFP16-NEXT: csetm w0, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s1
+; CHECK-SD-NOFP16-NEXT: mov h1, v2.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[3]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: mov v18.h[1], w12
+; CHECK-SD-NOFP16-NEXT: mov v19.h[1], w16
+; CHECK-SD-NOFP16-NEXT: csetm w1, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s17, s16
+; CHECK-SD-NOFP16-NEXT: fmov s16, w10
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: csetm w2, mi
+; CHECK-SD-NOFP16-NEXT: mov v16.h[1], w8
+; CHECK-SD-NOFP16-NEXT: mov v18.h[2], w13
+; CHECK-SD-NOFP16-NEXT: fmov s17, w2
+; CHECK-SD-NOFP16-NEXT: mov v19.h[2], w18
+; CHECK-SD-NOFP16-NEXT: fcmp s3, s1
+; CHECK-SD-NOFP16-NEXT: mov v17.h[1], w1
+; CHECK-SD-NOFP16-NEXT: mov v16.h[2], w9
+; CHECK-SD-NOFP16-NEXT: mov v18.h[3], w15
+; CHECK-SD-NOFP16-NEXT: mov v19.h[3], w0
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: fcmp s0, s2
+; CHECK-SD-NOFP16-NEXT: mov v17.h[2], w8
+; CHECK-SD-NOFP16-NEXT: mov v16.h[3], w11
+; CHECK-SD-NOFP16-NEXT: csetm w8, mi
+; CHECK-SD-NOFP16-NEXT: mov v17.h[3], w8
+; CHECK-SD-NOFP16-NEXT: sshll v2.4s, v16.4h, #0
+; CHECK-SD-NOFP16-NEXT: sshll v16.4s, v18.4h, #0
+; CHECK-SD-NOFP16-NEXT: ldp q0, q18, [sp]
+; CHECK-SD-NOFP16-NEXT: sshll v1.4s, v17.4h, #0
+; CHECK-SD-NOFP16-NEXT: sshll v17.4s, v19.4h, #0
+; CHECK-SD-NOFP16-NEXT: ldp q19, q3, [sp, #32]
+; CHECK-SD-NOFP16-NEXT: bit v0.16b, v4.16b, v1.16b
+; CHECK-SD-NOFP16-NEXT: mov v1.16b, v17.16b
+; CHECK-SD-NOFP16-NEXT: bit v3.16b, v7.16b, v2.16b
+; CHECK-SD-NOFP16-NEXT: mov v2.16b, v16.16b
+; CHECK-SD-NOFP16-NEXT: bsl v1.16b, v5.16b, v18.16b
+; CHECK-SD-NOFP16-NEXT: bsl v2.16b, v6.16b, v19.16b
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: v16f16_i32:
+; CHECK-SD-FP16: // %bb.0: // %entry
+; CHECK-SD-FP16-NEXT: fcmgt v0.8h, v2.8h, v0.8h
+; CHECK-SD-FP16-NEXT: fcmgt v1.8h, v3.8h, v1.8h
+; CHECK-SD-FP16-NEXT: ldp q2, q20, [sp]
+; CHECK-SD-FP16-NEXT: ldp q18, q19, [sp, #32]
+; CHECK-SD-FP16-NEXT: sshll v3.4s, v0.4h, #0
+; CHECK-SD-FP16-NEXT: sshll v16.4s, v1.4h, #0
+; CHECK-SD-FP16-NEXT: sshll2 v17.4s, v1.8h, #0
+; CHECK-SD-FP16-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-FP16-NEXT: mov v0.16b, v3.16b
+; CHECK-SD-FP16-NEXT: mov v3.16b, v17.16b
+; CHECK-SD-FP16-NEXT: bsl v1.16b, v5.16b, v20.16b
+; CHECK-SD-FP16-NEXT: bsl v0.16b, v4.16b, v2.16b
+; CHECK-SD-FP16-NEXT: mov v2.16b, v16.16b
+; CHECK-SD-FP16-NEXT: bsl v3.16b, v7.16b, v19.16b
+; CHECK-SD-FP16-NEXT: bsl v2.16b, v6.16b, v18.16b
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: v16f16_i32:
+; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h17, v2.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h2
+; CHECK-GI-NOFP16-NEXT: mov h20, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h21, v2.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[3]
+; CHECK-GI-NOFP16-NEXT: mov h19, v2.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s20, h20
+; CHECK-GI-NOFP16-NEXT: fcvt s21, h21
+; CHECK-GI-NOFP16-NEXT: cset w14, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[4]
+; CHECK-GI-NOFP16-NEXT: mov h17, v2.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h18
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h19
+; CHECK-GI-NOFP16-NEXT: cset w15, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s20, s21
+; CHECK-GI-NOFP16-NEXT: mov h20, v0.h[5]
+; CHECK-GI-NOFP16-NEXT: mov h21, v2.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: cset w9, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h19, v2.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvt s20, h20
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-GI-NOFP16-NEXT: fcvt s21, h21
+; CHECK-GI-NOFP16-NEXT: mov h2, v2.h[7]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h18
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h1
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h19
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h3
+; CHECK-GI-NOFP16-NEXT: cset w16, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s20, s21
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: cset w17, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h17, v3.h[1]
+; CHECK-GI-NOFP16-NEXT: cset w11, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s2
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[2]
+; CHECK-GI-NOFP16-NEXT: mov h2, v3.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: cset w10, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v1.h[3]
+; CHECK-GI-NOFP16-NEXT: mov h19, v3.h[3]
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: cset w18, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4]
+; CHECK-GI-NOFP16-NEXT: mov h17, v3.h[4]
+; CHECK-GI-NOFP16-NEXT: fcvt s18, h18
+; CHECK-GI-NOFP16-NEXT: fcvt s19, h19
+; CHECK-GI-NOFP16-NEXT: cset w0, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s2
+; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[5]
+; CHECK-GI-NOFP16-NEXT: mov h2, v3.h[5]
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h16
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h17
+; CHECK-GI-NOFP16-NEXT: cset w13, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s18, s19
+; CHECK-GI-NOFP16-NEXT: mov h18, v1.h[6]
+; CHECK-GI-NOFP16-NEXT: mov h19, v3.h[6]
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: mov h3, v3.h[7]
+; CHECK-GI-NOFP16-NEXT: cset w12, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: fcvt s16, h18
+; CHECK-GI-NOFP16-NEXT: fmov s18, w15
+; CHECK-GI-NOFP16-NEXT: fcvt s17, h19
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fmov s19, w17
+; CHECK-GI-NOFP16-NEXT: cset w1, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s0, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
+; CHECK-GI-NOFP16-NEXT: fmov s2, w16
+; CHECK-GI-NOFP16-NEXT: fmov s0, w14
+; CHECK-GI-NOFP16-NEXT: cset w14, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s16, s17
+; CHECK-GI-NOFP16-NEXT: fmov s16, w18
+; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v19.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s17, w0
+; CHECK-GI-NOFP16-NEXT: fmov s19, w14
+; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v18.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s18, w1
+; CHECK-GI-NOFP16-NEXT: cset w14, mi
+; CHECK-GI-NOFP16-NEXT: fcmp s1, s3
+; CHECK-GI-NOFP16-NEXT: fmov s1, w11
+; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s17, w9
+; CHECK-GI-NOFP16-NEXT: fmov s3, w14
+; CHECK-GI-NOFP16-NEXT: mov v18.h[1], v19.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s1, w13
+; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v17.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: mov v18.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: cset w8, mi
+; CHECK-GI-NOFP16-NEXT: fmov s3, w8
+; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s1, w10
+; CHECK-GI-NOFP16-NEXT: mov v18.h[3], v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: fmov s1, w12
+; CHECK-GI-NOFP16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NOFP16-NEXT: mov v16.h[3], v1.h[0]
+; CHECK-GI-NOFP16-NEXT: ushll v3.4s, v18.4h, #0
+; CHECK-GI-NOFP16-NEXT: ushll v1.4s, v2.4h, #0
+; CHECK-GI-NOFP16-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT: ldp q18, q19, [sp, #32]
+; CHECK-GI-NOFP16-NEXT: shl v3.4s, v3.4s, #31
+; CHECK-GI-NOFP16-NEXT: ushll v2.4s, v16.4h, #0
+; CHECK-GI-NOFP16-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-GI-NOFP16-NEXT: sshr v0.4s, v0.4s, #31
+; CHECK-GI-NOFP16-NEXT: ldp q16, q17, [sp]
+; CHECK-GI-NOFP16-NEXT: sshr v3.4s, v3.4s, #31
+; CHECK-GI-NOFP16-NEXT: shl v2.4s, v2.4s, #31
+; CHECK-GI-NOFP16-NEXT: sshr v1.4s, v1.4s, #31
+; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v4.16b, v16.16b
+; CHECK-GI-NOFP16-NEXT: bsl v3.16b, v7.16b, v19.16b
+; CHECK-GI-NOFP16-NEXT: sshr v2.4s, v2.4s, #31
+; CHECK-GI-NOFP16-NEXT: bsl v1.16b, v5.16b, v17.16b
+; CHECK-GI-NOFP16-NEXT: bsl v2.16b, v6.16b, v18.16b
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: v16f16_i32:
+; CHECK-GI-FP16: // %bb.0: // %entry
+; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v2.8h, v0.8h
+; CHECK-GI-FP16-NEXT: fcmgt v1.8h, v3.8h, v1.8h
+; CHECK-GI-FP16-NEXT: ldp q2, q20, [sp]
+; CHECK-GI-FP16-NEXT: ldp q18, q19, [sp, #32]
+; CHECK-GI-FP16-NEXT: sshll v3.4s, v0.4h, #0
+; CHECK-GI-FP16-NEXT: sshll v16.4s, v1.4h, #0
+; CHECK-GI-FP16-NEXT: sshll2 v17.4s, v1.8h, #0
+; CHECK-GI-FP16-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-GI-FP16-NEXT: mov v0.16b, v3.16b
+; CHECK-GI-FP16-NEXT: mov v3.16b, v17.16b
+; CHECK-GI-FP16-NEXT: bsl v1.16b, v5.16b, v20.16b
+; CHECK-GI-FP16-NEXT: bsl v0.16b, v4.16b, v2.16b
+; CHECK-GI-FP16-NEXT: mov v2.16b, v16.16b
+; CHECK-GI-FP16-NEXT: bsl v3.16b, v7.16b, v19.16b
+; CHECK-GI-FP16-NEXT: bsl v2.16b, v6.16b, v18.16b
+; CHECK-GI-FP16-NEXT: ret
+entry:
+ %c = fcmp olt <16 x half> %a, %b
+ %s = select <16 x i1> %c, <16 x i32> %d, <16 x i32> %e
+ ret <16 x i32> %s
}
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index f30dad966492..23ba85d54c7a 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -5846,11 +5846,9 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) {
; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i8:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmov w0, s0
-; CHECK-GI-FP16-NEXT: fmov w1, s1
-; CHECK-GI-FP16-NEXT: fmov w2, s2
+; CHECK-GI-FP16-NEXT: umov w0, v0.h[0]
+; CHECK-GI-FP16-NEXT: umov w1, v0.h[1]
+; CHECK-GI-FP16-NEXT: umov w2, v0.h[2]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fptosi <3 x half> %a to <3 x i8>
@@ -5890,11 +5888,9 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) {
; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i8:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmov w0, s0
-; CHECK-GI-FP16-NEXT: fmov w1, s1
-; CHECK-GI-FP16-NEXT: fmov w2, s2
+; CHECK-GI-FP16-NEXT: umov w0, v0.h[0]
+; CHECK-GI-FP16-NEXT: umov w1, v0.h[1]
+; CHECK-GI-FP16-NEXT: umov w2, v0.h[2]
; CHECK-GI-FP16-NEXT: ret
entry:
%c = fptoui <3 x half> %a to <3 x i8>
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
new file mode 100644
index 000000000000..d2b44bb5e3f9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -0,0 +1,253 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for v3i64_i64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64_i64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3i32_i32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16_i16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8_i8
+
+define i64 @i64_i64(i64 %a, i64 %b, i64 %d, i64 %e) {
+; CHECK-LABEL: i64_i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp x0, x1
+; CHECK-NEXT: csel x0, x2, x3, lt
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt i64 %a, %b
+ %s = select i1 %c, i64 %d, i64 %e
+ ret i64 %s
+}
+
+define i32 @i32_i32(i32 %a, i32 %b, i32 %d, i32 %e) {
+; CHECK-LABEL: i32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w0, w1
+; CHECK-NEXT: csel w0, w2, w3, lt
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt i32 %a, %b
+ %s = select i1 %c, i32 %d, i32 %e
+ ret i32 %s
+}
+
+define i16 @i16_i16(i16 %a, i16 %b, i16 %d, i16 %e) {
+; CHECK-LABEL: i16_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxth w8, w0
+; CHECK-NEXT: cmp w8, w1, sxth
+; CHECK-NEXT: csel w0, w2, w3, lt
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt i16 %a, %b
+ %s = select i1 %c, i16 %d, i16 %e
+ ret i16 %s
+}
+
+define i8 @i8_i8(i8 %a, i8 %b, i8 %d, i8 %e) {
+; CHECK-LABEL: i8_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sxtb w8, w0
+; CHECK-NEXT: cmp w8, w1, sxtb
+; CHECK-NEXT: csel w0, w2, w3, lt
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt i8 %a, %b
+ %s = select i1 %c, i8 %d, i8 %e
+ ret i8 %s
+}
+
+define <2 x i64> @v2i64_i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %d, <2 x i64> %e) {
+; CHECK-LABEL: v2i64_i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <2 x i64> %a, %b
+ %s = select <2 x i1> %c, <2 x i64> %d, <2 x i64> %e
+ ret <2 x i64> %s
+}
+
+define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64> %e) {
+; CHECK-LABEL: v3i64_i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d4 killed $d4 def $q4
+; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $d6 killed $d6 def $q6
+; CHECK-NEXT: // kill: def $d7 killed $d7 def $q7
+; CHECK-NEXT: // kill: def $d5 killed $d5 def $q5
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: ldr d16, [sp, #24]
+; CHECK-NEXT: ldr d17, [sp]
+; CHECK-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v6.d[1], v7.d[0]
+; CHECK-NEXT: ldp d1, d4, [sp, #8]
+; CHECK-NEXT: mov v1.d[1], v4.d[0]
+; CHECK-NEXT: cmgt v0.2d, v3.2d, v0.2d
+; CHECK-NEXT: bsl v0.16b, v6.16b, v1.16b
+; CHECK-NEXT: cmgt v1.2d, v5.2d, v2.2d
+; CHECK-NEXT: mov v2.16b, v1.16b
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT: bsl v2.16b, v17.16b, v16.16b
+; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q2
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <3 x i64> %a, %b
+ %s = select <3 x i1> %c, <3 x i64> %d, <3 x i64> %e
+ ret <3 x i64> %s
+}
+
+define <4 x i64> @v4i64_i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %d, <4 x i64> %e) {
+; CHECK-LABEL: v4i64_i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v1.2d, v3.2d, v1.2d
+; CHECK-NEXT: cmgt v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <4 x i64> %a, %b
+ %s = select <4 x i1> %c, <4 x i64> %d, <4 x i64> %e
+ ret <4 x i64> %s
+}
+
+define <2 x i32> @v2i32_i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %d, <2 x i32> %e) {
+; CHECK-LABEL: v2i32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <2 x i32> %a, %b
+ %s = select <2 x i1> %c, <2 x i32> %d, <2 x i32> %e
+ ret <2 x i32> %s
+}
+
+define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> %e) {
+; CHECK-LABEL: v3i32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <3 x i32> %a, %b
+ %s = select <3 x i1> %c, <3 x i32> %d, <3 x i32> %e
+ ret <3 x i32> %s
+}
+
+define <4 x i32> @v4i32_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d, <4 x i32> %e) {
+; CHECK-LABEL: v4i32_i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <4 x i32> %a, %b
+ %s = select <4 x i1> %c, <4 x i32> %d, <4 x i32> %e
+ ret <4 x i32> %s
+}
+
+define <8 x i32> @v8i32_i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %d, <8 x i32> %e) {
+; CHECK-SD-LABEL: v8i32_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmgt v1.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT: cmgt v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-SD-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmgt v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: cmgt v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-GI-NEXT: ret
+entry:
+ %c = icmp slt <8 x i32> %a, %b
+ %s = select <8 x i1> %c, <8 x i32> %d, <8 x i32> %e
+ ret <8 x i32> %s
+}
+
+define <4 x i16> @v4i16_i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %d, <4 x i16> %e) {
+; CHECK-LABEL: v4i16_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <4 x i16> %a, %b
+ %s = select <4 x i1> %c, <4 x i16> %d, <4 x i16> %e
+ ret <4 x i16> %s
+}
+
+define <8 x i16> @v8i16_i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %d, <8 x i16> %e) {
+; CHECK-LABEL: v8i16_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <8 x i16> %a, %b
+ %s = select <8 x i1> %c, <8 x i16> %d, <8 x i16> %e
+ ret <8 x i16> %s
+}
+
+define <16 x i16> @v16i16_i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %d, <16 x i16> %e) {
+; CHECK-LABEL: v16i16_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v1.8h, v3.8h, v1.8h
+; CHECK-NEXT: cmgt v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <16 x i16> %a, %b
+ %s = select <16 x i1> %c, <16 x i16> %d, <16 x i16> %e
+ ret <16 x i16> %s
+}
+
+define <8 x i8> @v8i8_i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %d, <8 x i8> %e) {
+; CHECK-LABEL: v8i8_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <8 x i8> %a, %b
+ %s = select <8 x i1> %c, <8 x i8> %d, <8 x i8> %e
+ ret <8 x i8> %s
+}
+
+define <16 x i8> @v16i8_i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %d, <16 x i8> %e) {
+; CHECK-LABEL: v16i8_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <16 x i8> %a, %b
+ %s = select <16 x i1> %c, <16 x i8> %d, <16 x i8> %e
+ ret <16 x i8> %s
+}
+
+define <32 x i8> @v32i8_i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %d, <32 x i8> %e) {
+; CHECK-LABEL: v32i8_i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmgt v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmgt v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: bsl v1.16b, v5.16b, v7.16b
+; CHECK-NEXT: bsl v0.16b, v4.16b, v6.16b
+; CHECK-NEXT: ret
+entry:
+ %c = icmp slt <32 x i8> %a, %b
+ %s = select <32 x i1> %c, <32 x i8> %d, <32 x i8> %e
+ ret <32 x i8> %s
+}
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index fa5902c65ce0..fa1ab61a6216 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -10,7 +10,6 @@
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for utofp_v3i8_v3f32
; CHECK-GI-NOFP16-NEXT: warning: Instruction selection used fallback path for stofp_v3i8_v3f16
; CHECK-GI-NOFP16-NEXT: warning: Instruction selection used fallback path for utofp_v3i8_v3f16
-; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for stofp_v2i8_v2f16
define double @stofp_i64_f64(i64 %a) {
; CHECK-LABEL: stofp_i64_f64:
@@ -5563,13 +5562,21 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT: fmov w9, s0
-; CHECK-GI-FP16-NEXT: sxtb w9, w9
-; CHECK-GI-FP16-NEXT: sxtb w8, w8
-; CHECK-GI-FP16-NEXT: scvtf h0, w9
-; CHECK-GI-FP16-NEXT: scvtf h1, w8
+; CHECK-GI-FP16-NEXT: mov s1, v0.s[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v0.h[0]
+; CHECK-GI-FP16-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-GI-FP16-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v0.h[0]
+; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[2], v0.h[0]
+; CHECK-GI-FP16-NEXT: mov v0.h[3], v0.h[0]
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-FP16-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
new file mode 100755
index 000000000000..15b6700398ea
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass aarch64-ldst-opt %s -o - | FileCheck %s
+
+
+---
+name: LdOffset
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '' }
+body: |
+ bb.0.entry:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: LdOffset
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
+ ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704
+ ; CHECK-NEXT: RET undef $lr, implicit $w0
+ renamable $w8 = MOVZWi 56952, 0
+ renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
+ RET undef $lr, implicit $w0
+...
+
+# Negative test: the IndexReg is missing the killed flag
+---
+name: LdOffset_missing_killed
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '' }
+body: |
+ bb.0.entry:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: LdOffset_missing_killed
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0
+ ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0
+ ; CHECK-NEXT: RET undef $lr, implicit $w0
+ renamable $w8 = MOVZWi 56952, 0
+ renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0
+ RET undef $lr, implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
index e43fcef30b00..765c81e26e13 100644
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1789,6 +1789,27 @@ define <8 x i1> @not_cmle8xi8(<8 x i8> %0) {
ret <8 x i1> %cmp.i
}
+define <4 x i1> @not_cmle16xi8(<4 x i32> %0) {
+; CHECK-SD-LABEL: not_cmle16xi8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v1.8h, #1
+; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: not_cmle16xi8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI134_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI134_0]
+; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: ret
+entry:
+ %bc = bitcast <16 x i8> <i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0> to <4 x i32>
+ %cmp.i = icmp slt <4 x i32> %0, %bc
+ ret <4 x i1> %cmp.i
+}
+
define <8 x i8> @cmltz8xi8_alt(<8 x i8> %A) {
; CHECK-SD-LABEL: cmltz8xi8_alt:
; CHECK-SD: // %bb.0:
@@ -2082,8 +2103,8 @@ define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
;
; CHECK-GI-LABEL: cmhsz2xi64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI154_0
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI154_0]
+; CHECK-GI-NEXT: adrp x8, .LCPI155_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI155_0]
; CHECK-GI-NEXT: cmhs v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT: ret
%tmp3 = icmp uge <2 x i64> %A, <i64 2, i64 2>
@@ -2168,8 +2189,8 @@ define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
;
; CHECK-GI-LABEL: cmhiz2xi64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI161_0
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI161_0]
+; CHECK-GI-NEXT: adrp x8, .LCPI162_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI162_0]
; CHECK-GI-NEXT: cmhi v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT: ret
%tmp3 = icmp ugt <2 x i64> %A, <i64 1, i64 1>
@@ -2344,8 +2365,8 @@ define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
;
; CHECK-GI-LABEL: cmloz2xi64:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI175_0
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI175_0]
+; CHECK-GI-NEXT: adrp x8, .LCPI176_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI176_0]
; CHECK-GI-NEXT: cmhi v0.2d, v1.2d, v0.2d
; CHECK-GI-NEXT: ret
%tmp3 = icmp ult <2 x i64> %A, <i64 2, i64 2>
@@ -4531,10 +4552,6 @@ define <4 x i64> @fcmoeq4xdouble(<4 x double> %A, <4 x double> %B) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fcmeq v0.2d, v0.2d, v2.2d
; CHECK-GI-NEXT: fcmeq v1.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT: shl v0.2d, v0.2d, #63
-; CHECK-GI-NEXT: shl v1.2d, v1.2d, #63
-; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #63
-; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #63
; CHECK-GI-NEXT: ret
%tmp3 = fcmp oeq <4 x double> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i64>
@@ -4552,10 +4569,6 @@ define <8 x i32> @fcmoeq8xfloat(<8 x float> %A, <8 x float> %B) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fcmeq v0.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: fcmeq v1.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT: shl v0.4s, v0.4s, #31
-; CHECK-GI-NEXT: shl v1.4s, v1.4s, #31
-; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #31
-; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #31
; CHECK-GI-NEXT: ret
%tmp3 = fcmp oeq <8 x float> %A, %B
%tmp4 = sext <8 x i1> %tmp3 to <8 x i32>
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index a20a76c00418..8b7438a42b71 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -53,13 +53,10 @@ define i1 @test_redand_v4i1(<4 x i1> %a) {
; GISEL-LABEL: test_redand_v4i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: and w8, w8, w9
; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w8, w8, w9
@@ -82,27 +79,20 @@ define i1 @test_redand_v8i1(<8 x i1> %a) {
; GISEL-LABEL: test_redand_v8i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: and w11, w12, w13
-; GISEL-NEXT: and w8, w8, w10
-; GISEL-NEXT: and w9, w14, w9
-; GISEL-NEXT: and w9, w11, w9
+; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: and w10, w12, w13
+; GISEL-NEXT: and w11, w14, w15
+; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w8, w8, w9
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
@@ -122,49 +112,34 @@ define i1 @test_redand_v16i1(<16 x i1> %a) {
;
; GISEL-LABEL: test_redand_v16i1:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: mov b16, v0.b[8]
-; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: mov b18, v0.b[10]
-; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s6
-; GISEL-NEXT: mov b20, v0.b[12]
-; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: fmov w13, s7
-; GISEL-NEXT: mov b22, v0.b[14]
-; GISEL-NEXT: mov b23, v0.b[15]
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: umov w16, v0.b[8]
+; GISEL-NEXT: umov w17, v0.b[9]
+; GISEL-NEXT: umov w18, v0.b[10]
+; GISEL-NEXT: umov w0, v0.b[11]
; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: umov w1, v0.b[12]
+; GISEL-NEXT: umov w2, v0.b[13]
; GISEL-NEXT: and w9, w10, w11
-; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: and w10, w12, w13
+; GISEL-NEXT: umov w3, v0.b[14]
+; GISEL-NEXT: and w11, w14, w15
; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: fmov w14, s18
-; GISEL-NEXT: fmov w15, s19
-; GISEL-NEXT: fmov w16, s22
-; GISEL-NEXT: fmov w17, s23
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: and w11, w12, w13
-; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: umov w4, v0.b[15]
+; GISEL-NEXT: and w12, w16, w17
+; GISEL-NEXT: and w13, w18, w0
; GISEL-NEXT: and w9, w10, w11
-; GISEL-NEXT: fmov w13, s17
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: and w12, w12, w13
-; GISEL-NEXT: and w13, w14, w15
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
+; GISEL-NEXT: and w14, w1, w2
; GISEL-NEXT: and w10, w12, w13
-; GISEL-NEXT: and w14, w14, w15
-; GISEL-NEXT: and w15, w16, w17
+; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w15, w3, w4
; GISEL-NEXT: and w11, w14, w15
; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w8, w8, w9
@@ -184,49 +159,34 @@ define <16 x i1> @test_redand_ins_v16i1(<16 x i1> %a) {
;
; GISEL-LABEL: test_redand_ins_v16i1:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: mov b16, v0.b[8]
-; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: mov b18, v0.b[10]
-; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s6
-; GISEL-NEXT: mov b20, v0.b[12]
-; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: fmov w13, s7
-; GISEL-NEXT: mov b22, v0.b[14]
-; GISEL-NEXT: mov b23, v0.b[15]
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: umov w16, v0.b[8]
+; GISEL-NEXT: umov w17, v0.b[9]
+; GISEL-NEXT: umov w18, v0.b[10]
+; GISEL-NEXT: umov w0, v0.b[11]
; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: umov w1, v0.b[12]
+; GISEL-NEXT: umov w2, v0.b[13]
; GISEL-NEXT: and w9, w10, w11
-; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: and w10, w12, w13
+; GISEL-NEXT: umov w3, v0.b[14]
+; GISEL-NEXT: and w11, w14, w15
; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: fmov w14, s18
-; GISEL-NEXT: fmov w15, s19
-; GISEL-NEXT: fmov w16, s22
-; GISEL-NEXT: fmov w17, s23
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: and w11, w12, w13
-; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: umov w4, v0.b[15]
+; GISEL-NEXT: and w12, w16, w17
+; GISEL-NEXT: and w13, w18, w0
; GISEL-NEXT: and w9, w10, w11
-; GISEL-NEXT: fmov w13, s17
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: and w12, w12, w13
-; GISEL-NEXT: and w13, w14, w15
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
+; GISEL-NEXT: and w14, w1, w2
; GISEL-NEXT: and w10, w12, w13
-; GISEL-NEXT: and w14, w14, w15
-; GISEL-NEXT: and w15, w16, w17
+; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w15, w3, w4
; GISEL-NEXT: and w11, w14, w15
; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w8, w8, w9
@@ -287,13 +247,10 @@ define i8 @test_redand_v4i8(<4 x i8> %a) {
; GISEL-LABEL: test_redand_v4i8:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: and w8, w8, w9
; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w0, w8, w9
@@ -315,27 +272,20 @@ define i8 @test_redand_v8i8(<8 x i8> %a) {
; GISEL-LABEL: test_redand_v8i8:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: and w11, w12, w13
-; GISEL-NEXT: and w8, w8, w10
-; GISEL-NEXT: and w9, w14, w9
-; GISEL-NEXT: and w9, w11, w9
+; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: and w10, w12, w13
+; GISEL-NEXT: and w11, w14, w15
+; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w0, w8, w9
; GISEL-NEXT: ret
%and_result = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
@@ -358,27 +308,20 @@ define i8 @test_redand_v16i8(<16 x i8> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: and w11, w12, w13
-; GISEL-NEXT: and w8, w8, w10
-; GISEL-NEXT: and w9, w14, w9
-; GISEL-NEXT: and w9, w11, w9
+; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: and w10, w12, w13
+; GISEL-NEXT: and w11, w14, w15
+; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w0, w8, w9
; GISEL-NEXT: ret
%and_result = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
@@ -403,27 +346,20 @@ define i8 @test_redand_v32i8(<32 x i8> %a) {
; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: and w10, w12, w13
+; GISEL-NEXT: and w11, w14, w15
; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: and w11, w12, w13
-; GISEL-NEXT: and w8, w8, w10
-; GISEL-NEXT: and w9, w14, w9
-; GISEL-NEXT: and w9, w11, w9
+; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w0, w8, w9
; GISEL-NEXT: ret
%and_result = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a)
@@ -442,13 +378,10 @@ define i16 @test_redand_v4i16(<4 x i16> %a) {
; GISEL-LABEL: test_redand_v4i16:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: and w8, w8, w9
; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w0, w8, w9
@@ -472,13 +405,10 @@ define i16 @test_redand_v8i16(<8 x i16> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: and w8, w8, w9
; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w0, w8, w9
@@ -504,13 +434,10 @@ define i16 @test_redand_v16i16(<16 x i16> %a) {
; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: and w8, w8, w9
; GISEL-NEXT: and w9, w10, w11
; GISEL-NEXT: and w0, w8, w9
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index 4c30a3293496..c4ac01f32e36 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -53,13 +53,10 @@ define i1 @test_redor_v4i1(<4 x i1> %a) {
; GISEL-LABEL: test_redor_v4i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: orr w8, w8, w9
; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w8, w8, w9
@@ -82,27 +79,20 @@ define i1 @test_redor_v8i1(<8 x i1> %a) {
; GISEL-LABEL: test_redor_v8i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: orr w11, w12, w13
-; GISEL-NEXT: orr w8, w8, w10
-; GISEL-NEXT: orr w9, w14, w9
-; GISEL-NEXT: orr w9, w11, w9
+; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: orr w10, w12, w13
+; GISEL-NEXT: orr w11, w14, w15
+; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w8, w8, w9
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
@@ -122,49 +112,34 @@ define i1 @test_redor_v16i1(<16 x i1> %a) {
;
; GISEL-LABEL: test_redor_v16i1:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: mov b16, v0.b[8]
-; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: mov b18, v0.b[10]
-; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s6
-; GISEL-NEXT: mov b20, v0.b[12]
-; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: fmov w13, s7
-; GISEL-NEXT: mov b22, v0.b[14]
-; GISEL-NEXT: mov b23, v0.b[15]
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: umov w16, v0.b[8]
+; GISEL-NEXT: umov w17, v0.b[9]
+; GISEL-NEXT: umov w18, v0.b[10]
+; GISEL-NEXT: umov w0, v0.b[11]
; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: umov w1, v0.b[12]
+; GISEL-NEXT: umov w2, v0.b[13]
; GISEL-NEXT: orr w9, w10, w11
-; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: orr w10, w12, w13
+; GISEL-NEXT: umov w3, v0.b[14]
+; GISEL-NEXT: orr w11, w14, w15
; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: fmov w14, s18
-; GISEL-NEXT: fmov w15, s19
-; GISEL-NEXT: fmov w16, s22
-; GISEL-NEXT: fmov w17, s23
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: orr w11, w12, w13
-; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: umov w4, v0.b[15]
+; GISEL-NEXT: orr w12, w16, w17
+; GISEL-NEXT: orr w13, w18, w0
; GISEL-NEXT: orr w9, w10, w11
-; GISEL-NEXT: fmov w13, s17
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: orr w12, w12, w13
-; GISEL-NEXT: orr w13, w14, w15
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
+; GISEL-NEXT: orr w14, w1, w2
; GISEL-NEXT: orr w10, w12, w13
-; GISEL-NEXT: orr w14, w14, w15
-; GISEL-NEXT: orr w15, w16, w17
+; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w15, w3, w4
; GISEL-NEXT: orr w11, w14, w15
; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w8, w8, w9
@@ -184,49 +159,34 @@ define <16 x i1> @test_redor_ins_v16i1(<16 x i1> %a) {
;
; GISEL-LABEL: test_redor_ins_v16i1:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: mov b16, v0.b[8]
-; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: mov b18, v0.b[10]
-; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s6
-; GISEL-NEXT: mov b20, v0.b[12]
-; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: fmov w13, s7
-; GISEL-NEXT: mov b22, v0.b[14]
-; GISEL-NEXT: mov b23, v0.b[15]
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: umov w16, v0.b[8]
+; GISEL-NEXT: umov w17, v0.b[9]
+; GISEL-NEXT: umov w18, v0.b[10]
+; GISEL-NEXT: umov w0, v0.b[11]
; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: umov w1, v0.b[12]
+; GISEL-NEXT: umov w2, v0.b[13]
; GISEL-NEXT: orr w9, w10, w11
-; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: orr w10, w12, w13
+; GISEL-NEXT: umov w3, v0.b[14]
+; GISEL-NEXT: orr w11, w14, w15
; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: fmov w14, s18
-; GISEL-NEXT: fmov w15, s19
-; GISEL-NEXT: fmov w16, s22
-; GISEL-NEXT: fmov w17, s23
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: orr w11, w12, w13
-; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: umov w4, v0.b[15]
+; GISEL-NEXT: orr w12, w16, w17
+; GISEL-NEXT: orr w13, w18, w0
; GISEL-NEXT: orr w9, w10, w11
-; GISEL-NEXT: fmov w13, s17
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: orr w12, w12, w13
-; GISEL-NEXT: orr w13, w14, w15
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
+; GISEL-NEXT: orr w14, w1, w2
; GISEL-NEXT: orr w10, w12, w13
-; GISEL-NEXT: orr w14, w14, w15
-; GISEL-NEXT: orr w15, w16, w17
+; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w15, w3, w4
; GISEL-NEXT: orr w11, w14, w15
; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w8, w8, w9
@@ -289,13 +249,10 @@ define i8 @test_redor_v4i8(<4 x i8> %a) {
; GISEL-LABEL: test_redor_v4i8:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: orr w8, w8, w9
; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w0, w8, w9
@@ -317,27 +274,20 @@ define i8 @test_redor_v8i8(<8 x i8> %a) {
; GISEL-LABEL: test_redor_v8i8:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: orr w11, w12, w13
-; GISEL-NEXT: orr w8, w8, w10
-; GISEL-NEXT: orr w9, w14, w9
-; GISEL-NEXT: orr w9, w11, w9
+; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: orr w10, w12, w13
+; GISEL-NEXT: orr w11, w14, w15
+; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w0, w8, w9
; GISEL-NEXT: ret
%or_result = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
@@ -360,27 +310,20 @@ define i8 @test_redor_v16i8(<16 x i8> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: orr w11, w12, w13
-; GISEL-NEXT: orr w8, w8, w10
-; GISEL-NEXT: orr w9, w14, w9
-; GISEL-NEXT: orr w9, w11, w9
+; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: orr w10, w12, w13
+; GISEL-NEXT: orr w11, w14, w15
+; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w0, w8, w9
; GISEL-NEXT: ret
%or_result = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
@@ -405,27 +348,20 @@ define i8 @test_redor_v32i8(<32 x i8> %a) {
; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: orr w10, w12, w13
+; GISEL-NEXT: orr w11, w14, w15
; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: orr w11, w12, w13
-; GISEL-NEXT: orr w8, w8, w10
-; GISEL-NEXT: orr w9, w14, w9
-; GISEL-NEXT: orr w9, w11, w9
+; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w0, w8, w9
; GISEL-NEXT: ret
%or_result = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a)
@@ -444,13 +380,10 @@ define i16 @test_redor_v4i16(<4 x i16> %a) {
; GISEL-LABEL: test_redor_v4i16:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: orr w8, w8, w9
; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w0, w8, w9
@@ -474,13 +407,10 @@ define i16 @test_redor_v8i16(<8 x i16> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: orr w8, w8, w9
; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w0, w8, w9
@@ -506,13 +436,10 @@ define i16 @test_redor_v16i16(<16 x i16> %a) {
; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: orr w8, w8, w9
; GISEL-NEXT: orr w9, w10, w11
; GISEL-NEXT: orr w0, w8, w9
diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
index c74b3734a1b7..5c2a808ef2e8 100644
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -48,13 +48,10 @@ define i1 @test_redxor_v4i1(<4 x i1> %a) {
; GISEL-LABEL: test_redxor_v4i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: eor w8, w8, w9
; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w8, w8, w9
@@ -75,27 +72,20 @@ define i1 @test_redxor_v8i1(<8 x i1> %a) {
; GISEL-LABEL: test_redxor_v8i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: eor w10, w10, w11
-; GISEL-NEXT: eor w11, w12, w13
-; GISEL-NEXT: eor w8, w8, w10
-; GISEL-NEXT: eor w9, w14, w9
-; GISEL-NEXT: eor w9, w11, w9
+; GISEL-NEXT: eor w9, w10, w11
+; GISEL-NEXT: eor w10, w12, w13
+; GISEL-NEXT: eor w11, w14, w15
+; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w8, w8, w9
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
@@ -113,49 +103,34 @@ define i1 @test_redxor_v16i1(<16 x i1> %a) {
;
; GISEL-LABEL: test_redxor_v16i1:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: mov b16, v0.b[8]
-; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: mov b18, v0.b[10]
-; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s6
-; GISEL-NEXT: mov b20, v0.b[12]
-; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: fmov w13, s7
-; GISEL-NEXT: mov b22, v0.b[14]
-; GISEL-NEXT: mov b23, v0.b[15]
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: umov w16, v0.b[8]
+; GISEL-NEXT: umov w17, v0.b[9]
+; GISEL-NEXT: umov w18, v0.b[10]
+; GISEL-NEXT: umov w0, v0.b[11]
; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: umov w1, v0.b[12]
+; GISEL-NEXT: umov w2, v0.b[13]
; GISEL-NEXT: eor w9, w10, w11
-; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: eor w10, w12, w13
+; GISEL-NEXT: umov w3, v0.b[14]
+; GISEL-NEXT: eor w11, w14, w15
; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: fmov w14, s18
-; GISEL-NEXT: fmov w15, s19
-; GISEL-NEXT: fmov w16, s22
-; GISEL-NEXT: fmov w17, s23
-; GISEL-NEXT: eor w10, w10, w11
-; GISEL-NEXT: eor w11, w12, w13
-; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: umov w4, v0.b[15]
+; GISEL-NEXT: eor w12, w16, w17
+; GISEL-NEXT: eor w13, w18, w0
; GISEL-NEXT: eor w9, w10, w11
-; GISEL-NEXT: fmov w13, s17
-; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: eor w12, w12, w13
-; GISEL-NEXT: eor w13, w14, w15
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
+; GISEL-NEXT: eor w14, w1, w2
; GISEL-NEXT: eor w10, w12, w13
-; GISEL-NEXT: eor w14, w14, w15
-; GISEL-NEXT: eor w15, w16, w17
+; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: eor w15, w3, w4
; GISEL-NEXT: eor w11, w14, w15
; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w8, w8, w9
@@ -173,49 +148,34 @@ define <16 x i1> @test_redxor_ins_v16i1(<16 x i1> %a) {
;
; GISEL-LABEL: test_redxor_ins_v16i1:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: mov b16, v0.b[8]
-; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: mov b18, v0.b[10]
-; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s6
-; GISEL-NEXT: mov b20, v0.b[12]
-; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: fmov w13, s7
-; GISEL-NEXT: mov b22, v0.b[14]
-; GISEL-NEXT: mov b23, v0.b[15]
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: umov w16, v0.b[8]
+; GISEL-NEXT: umov w17, v0.b[9]
+; GISEL-NEXT: umov w18, v0.b[10]
+; GISEL-NEXT: umov w0, v0.b[11]
; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: umov w1, v0.b[12]
+; GISEL-NEXT: umov w2, v0.b[13]
; GISEL-NEXT: eor w9, w10, w11
-; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: eor w10, w12, w13
+; GISEL-NEXT: umov w3, v0.b[14]
+; GISEL-NEXT: eor w11, w14, w15
; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: fmov w14, s18
-; GISEL-NEXT: fmov w15, s19
-; GISEL-NEXT: fmov w16, s22
-; GISEL-NEXT: fmov w17, s23
-; GISEL-NEXT: eor w10, w10, w11
-; GISEL-NEXT: eor w11, w12, w13
-; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: umov w4, v0.b[15]
+; GISEL-NEXT: eor w12, w16, w17
+; GISEL-NEXT: eor w13, w18, w0
; GISEL-NEXT: eor w9, w10, w11
-; GISEL-NEXT: fmov w13, s17
-; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: eor w12, w12, w13
-; GISEL-NEXT: eor w13, w14, w15
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
+; GISEL-NEXT: eor w14, w1, w2
; GISEL-NEXT: eor w10, w12, w13
-; GISEL-NEXT: eor w14, w14, w15
-; GISEL-NEXT: eor w15, w16, w17
+; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: eor w15, w3, w4
; GISEL-NEXT: eor w11, w14, w15
; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w8, w8, w9
@@ -278,13 +238,10 @@ define i8 @test_redxor_v4i8(<4 x i8> %a) {
; GISEL-LABEL: test_redxor_v4i8:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: eor w8, w8, w9
; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w0, w8, w9
@@ -306,27 +263,20 @@ define i8 @test_redxor_v8i8(<8 x i8> %a) {
; GISEL-LABEL: test_redxor_v8i8:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: eor w10, w10, w11
-; GISEL-NEXT: eor w11, w12, w13
-; GISEL-NEXT: eor w8, w8, w10
-; GISEL-NEXT: eor w9, w14, w9
-; GISEL-NEXT: eor w9, w11, w9
+; GISEL-NEXT: eor w9, w10, w11
+; GISEL-NEXT: eor w10, w12, w13
+; GISEL-NEXT: eor w11, w14, w15
+; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w0, w8, w9
; GISEL-NEXT: ret
%xor_result = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
@@ -349,27 +299,20 @@ define i8 @test_redxor_v16i8(<16 x i8> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: eor w10, w10, w11
-; GISEL-NEXT: eor w11, w12, w13
-; GISEL-NEXT: eor w8, w8, w10
-; GISEL-NEXT: eor w9, w14, w9
-; GISEL-NEXT: eor w9, w11, w9
+; GISEL-NEXT: eor w9, w10, w11
+; GISEL-NEXT: eor w10, w12, w13
+; GISEL-NEXT: eor w11, w14, w15
+; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w0, w8, w9
; GISEL-NEXT: ret
%xor_result = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
@@ -394,27 +337,20 @@ define i8 @test_redxor_v32i8(<32 x i8> %a) {
; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov b1, v0.b[1]
-; GISEL-NEXT: mov b2, v0.b[2]
-; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: mov b4, v0.b[4]
-; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: mov b6, v0.b[6]
-; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
-; GISEL-NEXT: fmov w12, s4
-; GISEL-NEXT: fmov w13, s5
-; GISEL-NEXT: fmov w14, s6
+; GISEL-NEXT: umov w8, v0.b[0]
+; GISEL-NEXT: umov w9, v0.b[1]
+; GISEL-NEXT: umov w10, v0.b[2]
+; GISEL-NEXT: umov w11, v0.b[3]
+; GISEL-NEXT: umov w12, v0.b[4]
+; GISEL-NEXT: umov w13, v0.b[5]
+; GISEL-NEXT: umov w14, v0.b[6]
+; GISEL-NEXT: umov w15, v0.b[7]
+; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: eor w9, w10, w11
+; GISEL-NEXT: eor w10, w12, w13
+; GISEL-NEXT: eor w11, w14, w15
; GISEL-NEXT: eor w8, w8, w9
-; GISEL-NEXT: fmov w9, s7
-; GISEL-NEXT: eor w10, w10, w11
-; GISEL-NEXT: eor w11, w12, w13
-; GISEL-NEXT: eor w8, w8, w10
-; GISEL-NEXT: eor w9, w14, w9
-; GISEL-NEXT: eor w9, w11, w9
+; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w0, w8, w9
; GISEL-NEXT: ret
%xor_result = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %a)
@@ -433,13 +369,10 @@ define i16 @test_redxor_v4i16(<4 x i16> %a) {
; GISEL-LABEL: test_redxor_v4i16:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: eor w8, w8, w9
; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w0, w8, w9
@@ -463,13 +396,10 @@ define i16 @test_redxor_v8i16(<8 x i16> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: eor w8, w8, w9
; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w0, w8, w9
@@ -495,13 +425,10 @@ define i16 @test_redxor_v16i16(<16 x i16> %a) {
; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov h1, v0.h[1]
-; GISEL-NEXT: mov h2, v0.h[2]
-; GISEL-NEXT: mov h3, v0.h[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: fmov w10, s2
-; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: umov w8, v0.h[0]
+; GISEL-NEXT: umov w9, v0.h[1]
+; GISEL-NEXT: umov w10, v0.h[2]
+; GISEL-NEXT: umov w11, v0.h[3]
; GISEL-NEXT: eor w8, w8, w9
; GISEL-NEXT: eor w9, w10, w11
; GISEL-NEXT: eor w0, w8, w9
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
index 18431ae021f9..d41870ec6e79 100644
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll
@@ -15,7 +15,7 @@ declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
;.
-; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [32 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x], section "llvm.metadata"
;.
define <2 x double> @llvm_cos_f64(<2 x double> %in) {
; CHECK-LABEL: define <2 x double> @llvm_cos_f64
@@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_cos_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
@@ -50,7 +50,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0
define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_cos_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
@@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_sin_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
@@ -95,7 +95,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0
define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_sin_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
@@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
@@ -140,7 +140,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0
define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
@@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp2_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
@@ -185,7 +185,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0
define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp2_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
@@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp10_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
@@ -230,7 +230,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) #
define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp10_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
@@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_log_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
@@ -275,7 +275,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0
define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_log_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
@@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_log2_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
@@ -320,7 +320,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0
define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_log2_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
@@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) {
define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #0 {
; CHECK-LABEL: define <vscale x 2 x double> @llvm_log10_vscale_f64
; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
@@ -365,7 +365,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #
define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) #0 {
; CHECK-LABEL: define <vscale x 4 x float> @llvm_log10_vscale_f32
; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> [[IN]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
@@ -380,7 +380,7 @@ declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4
;
; There is a bug in the replace-with-veclib pass, and for intrinsics which take
; more than one arguments, but has just one overloaded type, it incorrectly
-; reconstructs the scalar name, for pow specificlly it is searching for:
+; reconstructs the scalar name, for pow specifically it is searching for:
; llvm.pow.f64.f64 and llvm.pow.f32.f32
;
diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
index 8b06c41bcb1a..c2ff6014bc69 100644
--- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll
@@ -3,8 +3,9 @@
target triple = "aarch64-unknown-linux-gnu"
-; NOTE: The existing TLI mappings are not used since the -replace-with-veclib pass is broken for scalable vectors.
-
+;.
+; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf], section "llvm.metadata"
+;.
define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_ceil_vscale_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
@@ -43,7 +44,7 @@ define <vscale x 4 x float> @llvm_copysign_vscale_f32(<vscale x 4 x float> %mag,
define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_cos_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
@@ -52,7 +53,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_cos_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
@@ -61,7 +62,7 @@ define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_exp_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
@@ -70,7 +71,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_exp_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
@@ -79,7 +80,7 @@ define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_exp2_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
@@ -88,7 +89,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_exp2_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
@@ -97,7 +98,7 @@ define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_exp10_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
@@ -106,7 +107,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_exp10_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
@@ -169,7 +170,7 @@ define <vscale x 4 x float> @llvm_fma_vscale_f32(<vscale x 4 x float> %a, <vscal
define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_log_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
@@ -178,7 +179,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_log_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
@@ -187,7 +188,7 @@ define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_log10_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
@@ -196,7 +197,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_log10_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
@@ -205,7 +206,7 @@ define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_log2_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
@@ -214,7 +215,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_log2_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
@@ -331,7 +332,7 @@ define <vscale x 4 x float> @llvm_round_vscale_f32(<vscale x 4 x float> %in) {
define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
; CHECK-LABEL: @llvm_sin_vscale_f64(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]]
;
%1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
@@ -340,7 +341,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) {
; CHECK-LABEL: @llvm_sin_vscale_f32(
-; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
;
%1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 4d26228caf62..dd53780be14c 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -2,11 +2,8 @@
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for sext_v3i8_v3i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v3i8_v3i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v3i10_v3i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v3i10_v3i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v16i10_v16i16
+; CHECK-GI: warning: Instruction selection used fallback path for sext_v3i8_v3i32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v3i10_v3i32
define i16 @sext_i8_to_i16(i8 %a) {
; CHECK-LABEL: sext_i8_to_i16:
@@ -214,14 +211,25 @@ entry:
}
define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
-; CHECK-LABEL: sext_v3i8_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: mov v0.h[1], w1
-; CHECK-NEXT: mov v0.h[2], w2
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sext_v3i8_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: mov v0.h[1], w1
+; CHECK-SD-NEXT: mov v0.h[2], w2
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sext_v3i8_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i8> %a to <3 x i16>
ret <3 x i16> %c
@@ -266,10 +274,9 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: lsl x8, x2, #56
-; CHECK-GI-NEXT: asr x8, x8, #56
-; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: sxtb x8, w2
; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v0.d[1], x1
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-NEXT: mov d1, v0.d[1]
@@ -289,18 +296,14 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-LABEL: sext_v3i16_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: sxth w8, w8
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: sxth w9, w9
-; CHECK-GI-NEXT: sxth w8, w8
-; CHECK-GI-NEXT: mov v0.s[1], w9
-; CHECK-GI-NEXT: mov v0.s[2], w8
-; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: smov w8, v0.h[0]
+; CHECK-GI-NEXT: smov w9, v0.h[1]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: smov w8, v0.h[2]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i16> %a to <3 x i32>
@@ -322,15 +325,10 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) {
; CHECK-GI-LABEL: sext_v3i16_v3i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: sxth x8, w8
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
+; CHECK-GI-NEXT: smov x8, v0.h[0]
+; CHECK-GI-NEXT: smov x9, v0.h[1]
+; CHECK-GI-NEXT: smov x10, v0.h[2]
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: sxth x9, w9
-; CHECK-GI-NEXT: sxth x10, w10
; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
@@ -352,15 +350,10 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) {
;
; CHECK-GI-LABEL: sext_v3i32_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: sxtw x8, w8
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
+; CHECK-GI-NEXT: smov x8, v0.s[0]
+; CHECK-GI-NEXT: smov x9, v0.s[1]
+; CHECK-GI-NEXT: smov x10, v0.s[2]
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: sxtw x9, w9
-; CHECK-GI-NEXT: sxtw x10, w10
; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
@@ -370,14 +363,25 @@ entry:
}
define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
-; CHECK-LABEL: sext_v3i10_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: mov v0.h[1], w1
-; CHECK-NEXT: mov v0.h[2], w2
-; CHECK-NEXT: shl v0.4h, v0.4h, #6
-; CHECK-NEXT: sshr v0.4h, v0.4h, #6
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sext_v3i10_v3i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: mov v0.h[1], w1
+; CHECK-SD-NEXT: mov v0.h[2], w2
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #6
+; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #6
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sext_v3i10_v3i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: mov v0.s[1], w1
+; CHECK-GI-NEXT: mov v0.s[2], w2
+; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: shl v0.4h, v0.4h, #6
+; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #6
+; CHECK-GI-NEXT: ret
entry:
%c = sext <3 x i10> %a to <3 x i16>
ret <3 x i16> %c
@@ -422,10 +426,9 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT: lsl x8, x2, #54
-; CHECK-GI-NEXT: asr x8, x8, #54
-; CHECK-GI-NEXT: mov v0.d[1], x1
+; CHECK-GI-NEXT: sbfx x8, x2, #0, #10
; CHECK-GI-NEXT: fmov d2, x8
+; CHECK-GI-NEXT: mov v0.d[1], x1
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #54
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #54
; CHECK-GI-NEXT: mov d1, v0.d[1]
@@ -967,37 +970,69 @@ entry:
}
define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) {
-; CHECK-LABEL: sext_v16i10_v16i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr w8, [sp]
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: ldr w9, [sp, #8]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ldr w8, [sp, #16]
-; CHECK-NEXT: mov v1.h[1], w1
-; CHECK-NEXT: mov v0.h[1], w9
-; CHECK-NEXT: mov v1.h[2], w2
-; CHECK-NEXT: mov v0.h[2], w8
-; CHECK-NEXT: ldr w8, [sp, #24]
-; CHECK-NEXT: mov v1.h[3], w3
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: ldr w8, [sp, #32]
-; CHECK-NEXT: mov v1.h[4], w4
-; CHECK-NEXT: mov v0.h[4], w8
-; CHECK-NEXT: ldr w8, [sp, #40]
-; CHECK-NEXT: mov v1.h[5], w5
-; CHECK-NEXT: mov v0.h[5], w8
-; CHECK-NEXT: ldr w8, [sp, #48]
-; CHECK-NEXT: mov v1.h[6], w6
-; CHECK-NEXT: mov v0.h[6], w8
-; CHECK-NEXT: ldr w8, [sp, #56]
-; CHECK-NEXT: mov v1.h[7], w7
-; CHECK-NEXT: mov v0.h[7], w8
-; CHECK-NEXT: shl v1.8h, v1.8h, #6
-; CHECK-NEXT: shl v2.8h, v0.8h, #6
-; CHECK-NEXT: sshr v0.8h, v1.8h, #6
-; CHECK-NEXT: sshr v1.8h, v2.8h, #6
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sext_v16i10_v16i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr w8, [sp]
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: ldr w9, [sp, #8]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: ldr w8, [sp, #16]
+; CHECK-SD-NEXT: mov v1.h[1], w1
+; CHECK-SD-NEXT: mov v0.h[1], w9
+; CHECK-SD-NEXT: mov v1.h[2], w2
+; CHECK-SD-NEXT: mov v0.h[2], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #24]
+; CHECK-SD-NEXT: mov v1.h[3], w3
+; CHECK-SD-NEXT: mov v0.h[3], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #32]
+; CHECK-SD-NEXT: mov v1.h[4], w4
+; CHECK-SD-NEXT: mov v0.h[4], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #40]
+; CHECK-SD-NEXT: mov v1.h[5], w5
+; CHECK-SD-NEXT: mov v0.h[5], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #48]
+; CHECK-SD-NEXT: mov v1.h[6], w6
+; CHECK-SD-NEXT: mov v0.h[6], w8
+; CHECK-SD-NEXT: ldr w8, [sp, #56]
+; CHECK-SD-NEXT: mov v1.h[7], w7
+; CHECK-SD-NEXT: mov v0.h[7], w8
+; CHECK-SD-NEXT: shl v1.8h, v1.8h, #6
+; CHECK-SD-NEXT: shl v2.8h, v0.8h, #6
+; CHECK-SD-NEXT: sshr v0.8h, v1.8h, #6
+; CHECK-SD-NEXT: sshr v1.8h, v2.8h, #6
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sext_v16i10_v16i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w4
+; CHECK-GI-NEXT: ldr s0, [sp]
+; CHECK-GI-NEXT: ldr s1, [sp, #8]
+; CHECK-GI-NEXT: ldr s2, [sp, #32]
+; CHECK-GI-NEXT: ldr s3, [sp, #40]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[1], v3.s[0]
+; CHECK-GI-NEXT: ldr s1, [sp, #16]
+; CHECK-GI-NEXT: ldr s3, [sp, #48]
+; CHECK-GI-NEXT: mov v4.s[2], w2
+; CHECK-GI-NEXT: mov v5.s[2], w6
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-GI-NEXT: ldr s1, [sp, #24]
+; CHECK-GI-NEXT: ldr s3, [sp, #56]
+; CHECK-GI-NEXT: mov v4.s[3], w3
+; CHECK-GI-NEXT: mov v5.s[3], w7
+; CHECK-GI-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT: mov v2.s[3], v3.s[0]
+; CHECK-GI-NEXT: uzp1 v1.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: shl v1.8h, v1.8h, #6
+; CHECK-GI-NEXT: shl v2.8h, v0.8h, #6
+; CHECK-GI-NEXT: sshr v0.8h, v1.8h, #6
+; CHECK-GI-NEXT: sshr v1.8h, v2.8h, #6
+; CHECK-GI-NEXT: ret
entry:
%c = sext <16 x i10> %a to <16 x i16>
ret <16 x i16> %c
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
new file mode 100644
index 000000000000..a78fa853d99d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+
+; PauthLR is controlled via a combination of -mbranch-protection and +pauth-lr.
+; The +pc modifier (-mbranch-protection=pac-ret+pc) requests that the PC be used
+; as a diversifier when signing the return address. If the feature +pauth-lr is
+; available (v9.5a onwards) then non-NOP instructions are used; otherwise
+; NOP-compatible hint instructions are used.
+
+; There are 6 cases to cover:
+
+; feature \ -mbranch-protection= | none | pac-ret | pac-ret+pc
+; ------------------------------------------------------------------------
+; without +pauth-lr | no codegen | old pac | NOP pauth-lr
+; with +pauth-lr | no codegen | old pac | non-NOP pauth-lr
+
+; sign-return-address.ll tests combinations of -mbranch-protection=none/pac-ret
+; and whether +pauth-lr is present or not.
+
+; sign-return-address-pauth-lr.ll is identical, with the addition of the module
+; flag below, which enables -mbranch-protection=pac-ret+pc and therefore tests
+; the remaining parameter combinations in the table:
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"branch-protection-pauth-lr", i32 1}
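+;
+; For reference, a frontend invocation that is assumed to emit this module flag
+; is sketched below (hypothetical command, not exercised by this test; the exact
+; -march extension spelling is an assumption). The per-function
+; "sign-return-address" attributes seen in this file are normally derived from
+; the pac-ret part of -mbranch-protection.
+;   clang --target=aarch64 -march=armv9.5-a+pauth-lr \
+;         -mbranch-protection=pac-ret+pc -S -emit-llvm example.c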
+
+; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s
+; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s
+; RUN: llc -mtriple=aarch64 -mattr=v9a -mattr=pauth-lr < %s | FileCheck --check-prefixes=PAUTHLR %s
+
+define i32 @leaf(i32 %x) {
+; CHECK-LABEL: leaf:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+;
+; PAUTHLR-LABEL: leaf:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: ret
+ ret i32 %x
+}
+
+define i32 @leaf_sign_none(i32 %x) "sign-return-address"="none" {
+; CHECK-LABEL: leaf_sign_none:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+;
+; PAUTHLR-LABEL: leaf_sign_none:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: ret
+ ret i32 %x
+}
+
+define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" {
+; CHECK-LABEL: leaf_sign_non_leaf:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+;
+; PAUTHLR-LABEL: leaf_sign_non_leaf:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: ret
+ ret i32 %x
+}
+
+define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" {
+; COMPAT-LABEL: leaf_sign_all:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp0:
+; COMPAT-NEXT: hint #25
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: adr x16, .Ltmp0
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #29
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: leaf_sign_all:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp0:
+; V83A-NEXT: paciasp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: adr x16, .Ltmp0
+; V83A-NEXT: hint #39
+; V83A-NEXT: retaa
+;
+; PAUTHLR-LABEL: leaf_sign_all:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .Ltmp0:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retaasppc .Ltmp0
+ ret i32 %x
+}
+
+define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf" {
+; COMPAT-LABEL: leaf_clobbers_lr:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp1:
+; COMPAT-NEXT: hint #25
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; COMPAT-NEXT: .cfi_def_cfa_offset 16
+; COMPAT-NEXT: .cfi_offset w30, -16
+; COMPAT-NEXT: //APP
+; COMPAT-NEXT: mov x30, x0
+; COMPAT-NEXT: //NO_APP
+; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; COMPAT-NEXT: adr x16, .Ltmp1
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #29
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: leaf_clobbers_lr:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp1:
+; V83A-NEXT: paciasp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; V83A-NEXT: .cfi_def_cfa_offset 16
+; V83A-NEXT: .cfi_offset w30, -16
+; V83A-NEXT: //APP
+; V83A-NEXT: mov x30, x0
+; V83A-NEXT: //NO_APP
+; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; V83A-NEXT: adr x16, .Ltmp1
+; V83A-NEXT: hint #39
+; V83A-NEXT: retaa
+;
+; PAUTHLR-LABEL: leaf_clobbers_lr:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .Ltmp1:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; PAUTHLR-NEXT: .cfi_def_cfa_offset 16
+; PAUTHLR-NEXT: .cfi_offset w30, -16
+; PAUTHLR-NEXT: //APP
+; PAUTHLR-NEXT: mov x30, x0
+; PAUTHLR-NEXT: //NO_APP
+; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; PAUTHLR-NEXT: retaasppc .Ltmp1
+ call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1
+ ret i64 %x
+}
+
+declare i32 @foo(i32)
+
+define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" {
+; COMPAT-LABEL: non_leaf_sign_all:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp2:
+; COMPAT-NEXT: hint #25
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; COMPAT-NEXT: .cfi_def_cfa_offset 16
+; COMPAT-NEXT: .cfi_offset w30, -16
+; COMPAT-NEXT: bl foo
+; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; COMPAT-NEXT: adr x16, .Ltmp2
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #29
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: non_leaf_sign_all:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp2:
+; V83A-NEXT: paciasp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; V83A-NEXT: .cfi_def_cfa_offset 16
+; V83A-NEXT: .cfi_offset w30, -16
+; V83A-NEXT: bl foo
+; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; V83A-NEXT: adr x16, .Ltmp2
+; V83A-NEXT: hint #39
+; V83A-NEXT: retaa
+;
+; PAUTHLR-LABEL: non_leaf_sign_all:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .Ltmp2:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; PAUTHLR-NEXT: .cfi_def_cfa_offset 16
+; PAUTHLR-NEXT: .cfi_offset w30, -16
+; PAUTHLR-NEXT: bl foo
+; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; PAUTHLR-NEXT: retaasppc .Ltmp2
+ %call = call i32 @foo(i32 %x)
+ ret i32 %call
+}
+
+define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" {
+; COMPAT-LABEL: non_leaf_sign_non_leaf:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp3:
+; COMPAT-NEXT: hint #25
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; COMPAT-NEXT: .cfi_def_cfa_offset 16
+; COMPAT-NEXT: .cfi_offset w30, -16
+; COMPAT-NEXT: bl foo
+; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; COMPAT-NEXT: adr x16, .Ltmp3
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #29
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: non_leaf_sign_non_leaf:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp3:
+; V83A-NEXT: paciasp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; V83A-NEXT: .cfi_def_cfa_offset 16
+; V83A-NEXT: .cfi_offset w30, -16
+; V83A-NEXT: bl foo
+; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; V83A-NEXT: adr x16, .Ltmp3
+; V83A-NEXT: hint #39
+; V83A-NEXT: retaa
+;
+; PAUTHLR-LABEL: non_leaf_sign_non_leaf:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .Ltmp3:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; PAUTHLR-NEXT: .cfi_def_cfa_offset 16
+; PAUTHLR-NEXT: .cfi_offset w30, -16
+; PAUTHLR-NEXT: bl foo
+; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; PAUTHLR-NEXT: retaasppc .Ltmp3
+ %call = call i32 @foo(i32 %x)
+ ret i32 %call
+}
+
+; Should not use the RETAA instruction.
+define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstack "target-features"="+v8.3a,+reserve-x18" {
+; CHECK-LABEL: non_leaf_scs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [x18], #8
+; CHECK-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: paciasp
+; CHECK-NEXT: .cfi_negate_ra_state
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: adr x16, .Ltmp4
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: autiasp
+; CHECK-NEXT: ldr x30, [x18, #-8]!
+; CHECK-NEXT: ret
+;
+; PAUTHLR-LABEL: non_leaf_scs:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: str x30, [x18], #8
+; PAUTHLR-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 //
+; PAUTHLR-NEXT: .Ltmp4:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; PAUTHLR-NEXT: .cfi_def_cfa_offset 16
+; PAUTHLR-NEXT: .cfi_offset w30, -16
+; PAUTHLR-NEXT: bl foo
+; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; PAUTHLR-NEXT: autiasppc .Ltmp4
+; PAUTHLR-NEXT: ldr x30, [x18, #-8]!
+; PAUTHLR-NEXT: ret
+ %call = call i32 @foo(i32 %x)
+ ret i32 %call
+}
+
+define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" {
+; CHECK-LABEL: leaf_sign_all_v83:
+; CHECK: // %bb.0:
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: paciasp
+; CHECK-NEXT: .cfi_negate_ra_state
+; CHECK-NEXT: adr x16, .Ltmp5
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: retaa
+;
+; PAUTHLR-LABEL: leaf_sign_all_v83:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .Ltmp5:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retaasppc .Ltmp5
+ ret i32 %x
+}
+
+declare fastcc i64 @bar(i64)
+
+define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
+; COMPAT-LABEL: spill_lr_and_tail_call:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp6:
+; COMPAT-NEXT: hint #25
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; COMPAT-NEXT: .cfi_def_cfa_offset 16
+; COMPAT-NEXT: .cfi_offset w30, -16
+; COMPAT-NEXT: //APP
+; COMPAT-NEXT: mov x30, x0
+; COMPAT-NEXT: //NO_APP
+; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; COMPAT-NEXT: adr x16, .Ltmp6
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #29
+; COMPAT-NEXT: b bar
+;
+; V83A-LABEL: spill_lr_and_tail_call:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp6:
+; V83A-NEXT: paciasp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; V83A-NEXT: .cfi_def_cfa_offset 16
+; V83A-NEXT: .cfi_offset w30, -16
+; V83A-NEXT: //APP
+; V83A-NEXT: mov x30, x0
+; V83A-NEXT: //NO_APP
+; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; V83A-NEXT: adr x16, .Ltmp6
+; V83A-NEXT: hint #39
+; V83A-NEXT: autiasp
+; V83A-NEXT: b bar
+;
+; PAUTHLR-LABEL: spill_lr_and_tail_call:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .Ltmp6:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; PAUTHLR-NEXT: .cfi_def_cfa_offset 16
+; PAUTHLR-NEXT: .cfi_offset w30, -16
+; PAUTHLR-NEXT: //APP
+; PAUTHLR-NEXT: mov x30, x0
+; PAUTHLR-NEXT: //NO_APP
+; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; PAUTHLR-NEXT: autiasppc .Ltmp6
+; PAUTHLR-NEXT: b bar
+ call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1
+ tail call fastcc i64 @bar(i64 %x)
+ ret void
+}
+
+define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" {
+; COMPAT-LABEL: leaf_sign_all_a_key:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp7:
+; COMPAT-NEXT: hint #25
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: adr x16, .Ltmp7
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #29
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: leaf_sign_all_a_key:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp7:
+; V83A-NEXT: paciasp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: adr x16, .Ltmp7
+; V83A-NEXT: hint #39
+; V83A-NEXT: retaa
+;
+; PAUTHLR-LABEL: leaf_sign_all_a_key:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .Ltmp7:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retaasppc .Ltmp7
+ ret i32 %x
+}
+
+define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" {
+; COMPAT-LABEL: leaf_sign_all_b_key:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: .cfi_b_key_frame
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp8:
+; COMPAT-NEXT: hint #27
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: adr x16, .Ltmp8
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #31
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: leaf_sign_all_b_key:
+; V83A: // %bb.0:
+; V83A-NEXT: .cfi_b_key_frame
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp8:
+; V83A-NEXT: pacibsp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: adr x16, .Ltmp8
+; V83A-NEXT: hint #39
+; V83A-NEXT: retab
+;
+; PAUTHLR-LABEL: leaf_sign_all_b_key:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .cfi_b_key_frame
+; PAUTHLR-NEXT: .Ltmp8:
+; PAUTHLR-NEXT: pacibsppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retabsppc .Ltmp8
+ ret i32 %x
+}
+
+define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" {
+; CHECK-LABEL: leaf_sign_all_v83_b_key:
+; CHECK: // %bb.0:
+; CHECK-NEXT: .cfi_b_key_frame
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: .Ltmp9:
+; CHECK-NEXT: pacibsp
+; CHECK-NEXT: .cfi_negate_ra_state
+; CHECK-NEXT: adr x16, .Ltmp9
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: retab
+;
+; PAUTHLR-LABEL: leaf_sign_all_v83_b_key:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: .cfi_b_key_frame
+; PAUTHLR-NEXT: .Ltmp9:
+; PAUTHLR-NEXT: pacibsppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retabsppc .Ltmp9
+ ret i32 %x
+}
+
+; Note that a BTI instruction is not needed before PACIASP.
+define i32 @leaf_sign_all_a_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" "branch-target-enforcement"="true" {
+; COMPAT-LABEL: leaf_sign_all_a_key_bti:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #34
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp10:
+; COMPAT-NEXT: hint #25
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: adr x16, .Ltmp10
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #29
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: leaf_sign_all_a_key_bti:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #34
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp10:
+; V83A-NEXT: paciasp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: adr x16, .Ltmp10
+; V83A-NEXT: hint #39
+; V83A-NEXT: retaa
+;
+; PAUTHLR-LABEL: leaf_sign_all_a_key_bti:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: bti c
+; PAUTHLR-NEXT: .Ltmp10:
+; PAUTHLR-NEXT: paciasppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retaasppc .Ltmp10
+ ret i32 %x
+}
+
+; Note that a BTI instruction is not needed before PACIBSP.
+define i32 @leaf_sign_all_b_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" "branch-target-enforcement"="true" {
+; COMPAT-LABEL: leaf_sign_all_b_key_bti:
+; COMPAT: // %bb.0:
+; COMPAT-NEXT: hint #34
+; COMPAT-NEXT: .cfi_b_key_frame
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: .Ltmp11:
+; COMPAT-NEXT: hint #27
+; COMPAT-NEXT: .cfi_negate_ra_state
+; COMPAT-NEXT: adr x16, .Ltmp11
+; COMPAT-NEXT: hint #39
+; COMPAT-NEXT: hint #31
+; COMPAT-NEXT: ret
+;
+; V83A-LABEL: leaf_sign_all_b_key_bti:
+; V83A: // %bb.0:
+; V83A-NEXT: hint #34
+; V83A-NEXT: .cfi_b_key_frame
+; V83A-NEXT: hint #39
+; V83A-NEXT: .Ltmp11:
+; V83A-NEXT: pacibsp
+; V83A-NEXT: .cfi_negate_ra_state
+; V83A-NEXT: adr x16, .Ltmp11
+; V83A-NEXT: hint #39
+; V83A-NEXT: retab
+;
+; PAUTHLR-LABEL: leaf_sign_all_b_key_bti:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: bti c
+; PAUTHLR-NEXT: .cfi_b_key_frame
+; PAUTHLR-NEXT: .Ltmp11:
+; PAUTHLR-NEXT: pacibsppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retabsppc .Ltmp11
+ ret i32 %x
+}
+
+; Note that a BTI instruction is not needed before PACIBSP.
+define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" "branch-target-enforcement"="true" {
+; CHECK-LABEL: leaf_sign_all_v83_b_key_bti:
+; CHECK: // %bb.0:
+; CHECK-NEXT: hint #34
+; CHECK-NEXT: .cfi_b_key_frame
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: .Ltmp12:
+; CHECK-NEXT: pacibsp
+; CHECK-NEXT: .cfi_negate_ra_state
+; CHECK-NEXT: adr x16, .Ltmp12
+; CHECK-NEXT: hint #39
+; CHECK-NEXT: retab
+;
+; PAUTHLR-LABEL: leaf_sign_all_v83_b_key_bti:
+; PAUTHLR: // %bb.0:
+; PAUTHLR-NEXT: bti c
+; PAUTHLR-NEXT: .cfi_b_key_frame
+; PAUTHLR-NEXT: .Ltmp12:
+; PAUTHLR-NEXT: pacibsppc
+; PAUTHLR-NEXT: .cfi_negate_ra_state
+; PAUTHLR-NEXT: retabsppc .Ltmp12
+ ret i32 %x
+}
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address.ll b/llvm/test/CodeGen/AArch64/sign-return-address.ll
index 5680915c7f41..1481d4beb50d 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address.ll
@@ -2,6 +2,9 @@
; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s
; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s
+; v9.5-A is not expected to change codegen without the pac-ret+pc branch protection, so the V83A checks are reused.
+; RUN: llc -mtriple=aarch64 -mattr=v9.5a < %s | FileCheck --check-prefixes=CHECK,V83A %s
+
define i32 @leaf(i32 %x) {
; CHECK-LABEL: leaf:
; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 01756b846001..ef7d55a1c239 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -7,6 +7,8 @@
target triple = "aarch64--linux-gnu"
define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr() #0 { entry: unreachable }
+ define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr2() #0 { entry: unreachable }
+ define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr2mul2() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_pnr() #1 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_pnr() #1 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable }
@@ -64,6 +66,96 @@ body: |
RET_ReallyLR
...
---
+name: spills_fills_stack_id_ppr2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: ppr2 }
+stack:
+liveins:
+ - { reg: '$p0_p1', virtual-reg: '%0' }
+body: |
+ bb.0.entry:
+ liveins: $p0_p1
+
+ ; CHECK-LABEL: name: spills_fills_stack_id_ppr2
+ ; CHECK: stack:
+ ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
+ ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+
+ ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2
+ ; EXPAND: STR_PXI $p0, $sp, 6
+ ; EXPAND: STR_PXI $p1, $sp, 7
+ ; EXPAND: $p0 = LDR_PXI $sp, 6
+ ; EXPAND: $p1 = LDR_PXI $sp, 7
+
+ %0:ppr2 = COPY $p0_p1
+
+ $p0 = IMPLICIT_DEF
+ $p1 = IMPLICIT_DEF
+ $p2 = IMPLICIT_DEF
+ $p3 = IMPLICIT_DEF
+ $p4 = IMPLICIT_DEF
+ $p5 = IMPLICIT_DEF
+ $p6 = IMPLICIT_DEF
+ $p7 = IMPLICIT_DEF
+ $p8 = IMPLICIT_DEF
+ $p9 = IMPLICIT_DEF
+ $p10 = IMPLICIT_DEF
+ $p11 = IMPLICIT_DEF
+ $p12 = IMPLICIT_DEF
+ $p13 = IMPLICIT_DEF
+ $p14 = IMPLICIT_DEF
+ $p15 = IMPLICIT_DEF
+
+ $p0_p1 = COPY %0
+ RET_ReallyLR
+...
+---
+name: spills_fills_stack_id_ppr2mul2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: ppr2mul2 }
+stack:
+liveins:
+ - { reg: '$p0_p1', virtual-reg: '%0' }
+body: |
+ bb.0.entry:
+ liveins: $p0_p1
+
+ ; CHECK-LABEL: name: spills_fills_stack_id_ppr2mul2
+ ; CHECK: stack:
+ ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
+ ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+
+ ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2
+ ; EXPAND: STR_PXI $p0, $sp, 6
+ ; EXPAND: STR_PXI $p1, $sp, 7
+ ; EXPAND: $p0 = LDR_PXI $sp, 6
+ ; EXPAND: $p1 = LDR_PXI $sp, 7
+
+ %0:ppr2mul2 = COPY $p0_p1
+
+ $p0 = IMPLICIT_DEF
+ $p1 = IMPLICIT_DEF
+ $p2 = IMPLICIT_DEF
+ $p3 = IMPLICIT_DEF
+ $p4 = IMPLICIT_DEF
+ $p5 = IMPLICIT_DEF
+ $p6 = IMPLICIT_DEF
+ $p7 = IMPLICIT_DEF
+ $p8 = IMPLICIT_DEF
+ $p9 = IMPLICIT_DEF
+ $p10 = IMPLICIT_DEF
+ $p11 = IMPLICIT_DEF
+ $p12 = IMPLICIT_DEF
+ $p13 = IMPLICIT_DEF
+ $p14 = IMPLICIT_DEF
+ $p15 = IMPLICIT_DEF
+
+ $p0_p1 = COPY %0
+ RET_ReallyLR
+...
+---
name: spills_fills_stack_id_pnr
tracksRegLiveness: true
registers:
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll b/llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll
new file mode 100644
index 000000000000..4dcc81feb72f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s | FileCheck %s
+
+; Derived from
+; #include <arm_sve.h>
+
+; void g();
+
+; svboolx2_t f0(int64_t i, int64_t n) {
+; svboolx2_t r = svwhilelt_b16_x2(i, n);
+; g();
+; return r;
+; }
+
+; svboolx2_t f1(svcount_t n) {
+; svboolx2_t r = svpext_lane_c8_x2(n, 1);
+; g();
+; return r;
+; }
+;
+; Check that predicate register pairs are spilled/filled without an ICE in the backend.
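+;
+; The IR below is assumed to come from compiling the snippet above at -O2 for a
+; target with the features listed in attributes #0; a hypothetical invocation
+; (flag spellings are assumptions, not part of the test) would be roughly:
+;   clang --target=aarch64-unknown-linux -march=armv9-a+sve2p1 -O2 -S -emit-llvm f.c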
+
+target triple = "aarch64-unknown-linux"
+
+define <vscale x 32 x i1> @f0(i64 %i, i64 %n) #0 {
+entry:
+ %0 = tail call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 %i, i64 %n)
+ %1 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 0
+ %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+ %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+ %4 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 1
+ %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %4)
+ %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+ tail call void @g()
+ ret <vscale x 32 x i1> %6
+}
+; CHECK-LABEL: f0:
+; CHECK: whilelt { p0.h, p1.h }
+; CHECK: str p0, [sp, #6, mul vl]
+; CHECK: str p1, [sp, #7, mul vl]
+; CHECK: ldr p0, [sp, #6, mul vl]
+; CHECK: ldr p1, [sp, #7, mul vl]
+
+define <vscale x 32 x i1> @f1(target("aarch64.svcount") %n) #0 {
+entry:
+ %0 = tail call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") %n, i32 1)
+ %1 = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } %0, 0
+ %2 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %1, i64 0)
+ %3 = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } %0, 1
+ %4 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %2, <vscale x 16 x i1> %3, i64 16)
+ tail call void @g()
+ ret <vscale x 32 x i1> %4
+}
+
+; CHECK-LABEL: f1:
+; CHECK: pext { p0.b, p1.b }
+; CHECK: str p0, [sp, #6, mul vl]
+; CHECK: str p1, [sp, #7, mul vl]
+; CHECK: ldr p0, [sp, #6, mul vl]
+; CHECK: ldr p1, [sp, #7, mul vl]
+
+declare void @g(...)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64, i64)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1>, <vscale x 16 x i1>, i64 immarg)
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount"), i32 immarg)
+
+attributes #0 = { nounwind "target-features"="+sve,+sve2,+sve2p1" }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
new file mode 100644
index 000000000000..63b5a97703e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -0,0 +1,306 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+
+define float @add_HalfS(<2 x float> %bin.rdx) {
+; CHECK-LABEL: add_HalfS:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
+ ret float %r
+}
+
+define half @add_HalfH(<4 x half> %bin.rdx) {
+; CHECK-SD-NOFP16-LABEL: add_HalfH:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: add_HalfH:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-FP16-NEXT: mov h1, v0.h[2]
+; CHECK-SD-FP16-NEXT: faddp h2, v0.2h
+; CHECK-SD-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-SD-FP16-NEXT: fadd h1, h2, h1
+; CHECK-SD-FP16-NEXT: fadd h0, h1, h0
+; CHECK-SD-FP16-NEXT: ret
+ %r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
+ ret half %r
+}
+
+
+define half @add_H(<8 x half> %bin.rdx) {
+; CHECK-SD-NOFP16-LABEL: add_H:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: add_H:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: mov h1, v0.h[2]
+; CHECK-SD-FP16-NEXT: faddp h2, v0.2h
+; CHECK-SD-FP16-NEXT: mov h3, v0.h[3]
+; CHECK-SD-FP16-NEXT: fadd h1, h2, h1
+; CHECK-SD-FP16-NEXT: mov h2, v0.h[4]
+; CHECK-SD-FP16-NEXT: fadd h1, h1, h3
+; CHECK-SD-FP16-NEXT: mov h3, v0.h[5]
+; CHECK-SD-FP16-NEXT: fadd h1, h1, h2
+; CHECK-SD-FP16-NEXT: mov h2, v0.h[6]
+; CHECK-SD-FP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-FP16-NEXT: fadd h1, h1, h3
+; CHECK-SD-FP16-NEXT: fadd h1, h1, h2
+; CHECK-SD-FP16-NEXT: fadd h0, h1, h0
+; CHECK-SD-FP16-NEXT: ret
+ %r = call half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
+ ret half %r
+}
+
+define float @add_S(<4 x float> %bin.rdx) {
+; CHECK-LABEL: add_S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s1, v0.s[2]
+; CHECK-NEXT: faddp s2, v0.2s
+; CHECK-NEXT: mov s0, v0.s[3]
+; CHECK-NEXT: fadd s1, s2, s1
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
+ ret float %r
+}
+
+define double @add_D(<2 x double> %bin.rdx) {
+; CHECK-LABEL: add_D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp d0, v0.2d
+; CHECK-NEXT: ret
+ %r = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
+ ret double %r
+}
+
+define half @add_2H(<16 x half> %bin.rdx) {
+; CHECK-SD-NOFP16-LABEL: add_2H:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s2, s3, s2
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s0, s2, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: add_2H:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-SD-FP16-NEXT: faddp h3, v0.2h
+; CHECK-SD-FP16-NEXT: mov h4, v0.h[3]
+; CHECK-SD-FP16-NEXT: fadd h2, h3, h2
+; CHECK-SD-FP16-NEXT: mov h3, v0.h[4]
+; CHECK-SD-FP16-NEXT: fadd h2, h2, h4
+; CHECK-SD-FP16-NEXT: mov h4, v0.h[5]
+; CHECK-SD-FP16-NEXT: fadd h2, h2, h3
+; CHECK-SD-FP16-NEXT: mov h3, v0.h[6]
+; CHECK-SD-FP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-FP16-NEXT: fadd h2, h2, h4
+; CHECK-SD-FP16-NEXT: fadd h2, h2, h3
+; CHECK-SD-FP16-NEXT: mov h3, v1.h[2]
+; CHECK-SD-FP16-NEXT: fadd h0, h2, h0
+; CHECK-SD-FP16-NEXT: mov h2, v1.h[1]
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h1
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h2
+; CHECK-SD-FP16-NEXT: mov h2, v1.h[3]
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h3
+; CHECK-SD-FP16-NEXT: mov h3, v1.h[4]
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h2
+; CHECK-SD-FP16-NEXT: mov h2, v1.h[5]
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h3
+; CHECK-SD-FP16-NEXT: mov h3, v1.h[6]
+; CHECK-SD-FP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h2
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h3
+; CHECK-SD-FP16-NEXT: fadd h0, h0, h1
+; CHECK-SD-FP16-NEXT: ret
+ %r = call half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx)
+ ret half %r
+}
+
+define float @add_2S(<8 x float> %bin.rdx) {
+; CHECK-LABEL: add_2S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s2, v0.s[2]
+; CHECK-NEXT: faddp s3, v0.2s
+; CHECK-NEXT: mov s0, v0.s[3]
+; CHECK-NEXT: fadd s2, s3, s2
+; CHECK-NEXT: mov s3, v1.s[2]
+; CHECK-NEXT: fadd s0, s2, s0
+; CHECK-NEXT: mov s2, v1.s[1]
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: mov s1, v1.s[3]
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s0, s0, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
+ ret float %r
+}
+
+define double @add_2D(<4 x double> %bin.rdx) {
+; CHECK-LABEL: add_2D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: faddp d0, v0.2d
+; CHECK-NEXT: mov d2, v1.d[1]
+; CHECK-NEXT: fadd d0, d0, d1
+; CHECK-NEXT: fadd d0, d0, d2
+; CHECK-NEXT: ret
+ %r = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
+ ret double %r
+}
+
+; Added at least one test where the start value is not -0.0.
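+; Without the 'reassoc' fast-math flag, @llvm.vector.reduce.fadd is a strict,
+; in-order reduction that starts from the given start value, so the expected
+; expansion for the function below is (((42.0 + v[0]) + v[1]) + v[2]) + v[3].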
+define float @add_S_init_42(<4 x float> %bin.rdx) {
+; CHECK-LABEL: add_S_init_42:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1109917696 // =0x42280000
+; CHECK-NEXT: mov s2, v0.s[1]
+; CHECK-NEXT: mov s3, v0.s[2]
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: fadd s1, s0, s1
+; CHECK-NEXT: mov s0, v0.s[3]
+; CHECK-NEXT: fadd s1, s1, s2
+; CHECK-NEXT: fadd s1, s1, s3
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
+ ret float %r
+}
+
+; Function Attrs: nounwind readnone
+declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
+declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
+declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
+declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
new file mode 100644
index 000000000000..68cd3496a923
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+
+define float @mul_HalfS(<2 x float> %bin.rdx) {
+; CHECK-LABEL: mul_HalfS:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
+ ret float %r
+}
+
+define half @mul_HalfH(<4 x half> %bin.rdx) {
+; CHECK-SD-NOFP16-LABEL: mul_HalfH:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s2, s1
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s0, s1, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: mul_HalfH:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1]
+; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2]
+; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3]
+; CHECK-SD-FP16-NEXT: ret
+ %r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
+ ret half %r
+}
+
+
+define half @mul_H(<8 x half> %bin.rdx) {
+; CHECK-SD-NOFP16-LABEL: mul_H:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s2, s1
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fmul s0, s1, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: mul_H:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1]
+; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2]
+; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[3]
+; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[4]
+; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[5]
+; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[6]
+; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[7]
+; CHECK-SD-FP16-NEXT: ret
+ %r = call half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
+ ret half %r
+}
+
+define float @mul_S(<4 x float> %bin.rdx) {
+; CHECK-LABEL: mul_S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul s1, s0, v0.s[1]
+; CHECK-NEXT: fmul s1, s1, v0.s[2]
+; CHECK-NEXT: fmul s0, s1, v0.s[3]
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
+ ret float %r
+}
+
+define double @mul_D(<2 x double> %bin.rdx) {
+; CHECK-LABEL: mul_D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul d0, d0, v0.d[1]
+; CHECK-NEXT: ret
+ %r = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx)
+ ret double %r
+}
+
+define half @mul_2H(<16 x half> %bin.rdx) {
+; CHECK-SD-NOFP16-LABEL: mul_2H:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmul s2, s3, s2
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmul s0, s2, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: mul_2H:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: fmul h2, h0, v0.h[1]
+; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[2]
+; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[3]
+; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[4]
+; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[5]
+; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[6]
+; CHECK-SD-FP16-NEXT: fmul h0, h2, v0.h[7]
+; CHECK-SD-FP16-NEXT: fmul h0, h0, h1
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[1]
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[2]
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[3]
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[4]
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[5]
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[6]
+; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[7]
+; CHECK-SD-FP16-NEXT: ret
+ %r = call half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
+ ret half %r
+}
+
+define float @mul_2S(<8 x float> %bin.rdx) {
+; CHECK-LABEL: mul_2S:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul s2, s0, v0.s[1]
+; CHECK-NEXT: fmul s2, s2, v0.s[2]
+; CHECK-NEXT: fmul s0, s2, v0.s[3]
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fmul s0, s0, v1.s[1]
+; CHECK-NEXT: fmul s0, s0, v1.s[2]
+; CHECK-NEXT: fmul s0, s0, v1.s[3]
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
+ ret float %r
+}
+
+define double @mul_2D(<4 x double> %bin.rdx) {
+; CHECK-LABEL: mul_2D:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul d0, d0, v0.d[1]
+; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: fmul d0, d0, v1.d[1]
+; CHECK-NEXT: ret
+ %r = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx)
+ ret double %r
+}
+
+; Test where the start value is not 1.0.
+define float @mul_S_init_42(<4 x float> %bin.rdx) {
+; CHECK-LABEL: mul_S_init_42:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1109917696 // =0x42280000
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: fmul s1, s1, s0
+; CHECK-NEXT: fmul s1, s1, v0.s[1]
+; CHECK-NEXT: fmul s1, s1, v0.s[2]
+; CHECK-NEXT: fmul s0, s1, v0.s[3]
+; CHECK-NEXT: ret
+ %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
+ ret float %r
+}
+
+; Function Attrs: nounwind readnone
+declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
+declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>)
+declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>)
+declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index 53aefaf3d336..7f804fe48fd8 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -168,53 +168,32 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
; CHECK-GI-LABEL: test_v9i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov b2, v0.b[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov b3, v0.b[3]
-; CHECK-GI-NEXT: mov b4, v0.b[4]
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: uxtb w8, w8
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w11, s2
-; CHECK-GI-NEXT: fmov w12, s1
-; CHECK-GI-NEXT: mov b1, v0.b[5]
-; CHECK-GI-NEXT: mov b2, v0.b[6]
-; CHECK-GI-NEXT: cmp w8, w10, uxtb
-; CHECK-GI-NEXT: fmov w10, s3
-; CHECK-GI-NEXT: uxtb w8, w11
-; CHECK-GI-NEXT: csel w9, w9, w12, hi
-; CHECK-GI-NEXT: cmp w8, w9, uxtb
-; CHECK-GI-NEXT: uxtb w8, w10
-; CHECK-GI-NEXT: fmov w10, s4
-; CHECK-GI-NEXT: csel w9, w9, w11, lo
-; CHECK-GI-NEXT: fmov w11, s3
-; CHECK-GI-NEXT: mov b3, v0.b[7]
-; CHECK-GI-NEXT: mov b0, v0.b[8]
-; CHECK-GI-NEXT: cmp w8, w9, uxtb
-; CHECK-GI-NEXT: uxtb w8, w10
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: csel w9, w9, w11, lo
-; CHECK-GI-NEXT: fmov w11, s4
-; CHECK-GI-NEXT: cmp w8, w9, uxtb
-; CHECK-GI-NEXT: uxtb w8, w10
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: csel w9, w9, w11, lo
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[1]
+; CHECK-GI-NEXT: umov w10, v0.b[2]
; CHECK-GI-NEXT: fmov w11, s1
-; CHECK-GI-NEXT: cmp w8, w9, uxtb
-; CHECK-GI-NEXT: uxtb w8, w10
-; CHECK-GI-NEXT: fmov w10, s3
-; CHECK-GI-NEXT: csel w9, w9, w11, lo
-; CHECK-GI-NEXT: fmov w11, s2
-; CHECK-GI-NEXT: cmp w8, w9, uxtb
-; CHECK-GI-NEXT: uxtb w8, w10
-; CHECK-GI-NEXT: fmov w10, s0
-; CHECK-GI-NEXT: csel w9, w9, w11, lo
-; CHECK-GI-NEXT: fmov w11, s3
-; CHECK-GI-NEXT: cmp w8, w9, uxtb
-; CHECK-GI-NEXT: uxtb w8, w10
-; CHECK-GI-NEXT: csel w9, w9, w11, lo
-; CHECK-GI-NEXT: cmp w8, w9, uxtb
-; CHECK-GI-NEXT: csel w0, w9, w10, lo
+; CHECK-GI-NEXT: cmp w8, w11, uxtb
+; CHECK-GI-NEXT: umov w11, v0.b[3]
+; CHECK-GI-NEXT: csel w8, w8, w9, hi
+; CHECK-GI-NEXT: umov w9, v0.b[4]
+; CHECK-GI-NEXT: cmp w10, w8, uxtb
+; CHECK-GI-NEXT: csel w8, w8, w10, lo
+; CHECK-GI-NEXT: umov w10, v0.b[5]
+; CHECK-GI-NEXT: cmp w11, w8, uxtb
+; CHECK-GI-NEXT: csel w8, w8, w11, lo
+; CHECK-GI-NEXT: umov w11, v0.b[6]
+; CHECK-GI-NEXT: cmp w9, w8, uxtb
+; CHECK-GI-NEXT: csel w8, w8, w9, lo
+; CHECK-GI-NEXT: umov w9, v0.b[7]
+; CHECK-GI-NEXT: cmp w10, w8, uxtb
+; CHECK-GI-NEXT: csel w8, w8, w10, lo
+; CHECK-GI-NEXT: umov w10, v0.b[8]
+; CHECK-GI-NEXT: cmp w11, w8, uxtb
+; CHECK-GI-NEXT: csel w8, w8, w11, lo
+; CHECK-GI-NEXT: cmp w9, w8, uxtb
+; CHECK-GI-NEXT: csel w8, w8, w9, lo
+; CHECK-GI-NEXT: cmp w10, w8, uxtb
+; CHECK-GI-NEXT: csel w0, w8, w10, lo
; CHECK-GI-NEXT: ret
%b = call i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a)
ret i8 %b
@@ -259,21 +238,18 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind {
; CHECK-GI-LABEL: test_v4i1:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: mov h3, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w10, s1
-; CHECK-GI-NEXT: fmov w12, s2
-; CHECK-GI-NEXT: fmov w13, s3
-; CHECK-GI-NEXT: and w9, w8, #0x1
-; CHECK-GI-NEXT: and w11, w10, #0x1
-; CHECK-GI-NEXT: cmp w9, w11
-; CHECK-GI-NEXT: and w9, w12, #0x1
-; CHECK-GI-NEXT: and w11, w13, #0x1
-; CHECK-GI-NEXT: csel w8, w8, w10, hi
-; CHECK-GI-NEXT: cmp w9, w11
-; CHECK-GI-NEXT: csel w9, w12, w13, hi
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
+; CHECK-GI-NEXT: umov w11, v0.h[3]
+; CHECK-GI-NEXT: and w12, w8, #0x1
+; CHECK-GI-NEXT: and w13, w9, #0x1
+; CHECK-GI-NEXT: cmp w12, w13
+; CHECK-GI-NEXT: and w12, w10, #0x1
+; CHECK-GI-NEXT: and w13, w11, #0x1
+; CHECK-GI-NEXT: csel w8, w8, w9, hi
+; CHECK-GI-NEXT: cmp w12, w13
+; CHECK-GI-NEXT: csel w9, w10, w11, hi
; CHECK-GI-NEXT: and w10, w8, #0x1
; CHECK-GI-NEXT: and w11, w9, #0x1
; CHECK-GI-NEXT: cmp w10, w11
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 0dd4e3644b78..21982fadbe80 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -224,23 +224,13 @@ entry:
}
define <3 x i8> @xtn_v3i16_v3i8(<3 x i16> %a) {
-; CHECK-SD-LABEL: xtn_v3i16_v3i8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: umov w0, v0.h[0]
-; CHECK-SD-NEXT: umov w1, v0.h[1]
-; CHECK-SD-NEXT: umov w2, v0.h[2]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: xtn_v3i16_v3i8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: fmov w1, s1
-; CHECK-GI-NEXT: fmov w2, s2
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: xtn_v3i16_v3i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: umov w1, v0.h[1]
+; CHECK-NEXT: umov w2, v0.h[2]
+; CHECK-NEXT: ret
entry:
%arg1 = trunc <3 x i16> %a to <3 x i8>
ret <3 x i8> %arg1
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 42c0bf79e778..e513340f5b18 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -2,6 +2,8 @@
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; CHECK-GI: warning: Instruction selection used fallback path for zext_v16i10_v16i16
+
define i16 @zext_i8_to_i16(i8 %a) {
; CHECK-LABEL: zext_i8_to_i16:
; CHECK: // %bb.0: // %entry
@@ -333,18 +335,14 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
; CHECK-GI-LABEL: zext_v3i16_v3i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: uxth w8, w8
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: uxth w9, w9
-; CHECK-GI-NEXT: uxth w8, w8
-; CHECK-GI-NEXT: mov v0.s[1], w9
-; CHECK-GI-NEXT: mov v0.s[2], w8
-; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: umov w8, v0.h[2]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
%c = zext <3 x i16> %a to <3 x i32>
@@ -366,15 +364,10 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) {
; CHECK-GI-LABEL: zext_v3i16_v3i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: ubfx x8, x8, #0, #16
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
+; CHECK-GI-NEXT: umov w8, v0.h[0]
+; CHECK-GI-NEXT: umov w9, v0.h[1]
+; CHECK-GI-NEXT: umov w10, v0.h[2]
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: ubfx x9, x9, #0, #16
-; CHECK-GI-NEXT: ubfx x10, x10, #0, #16
; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
@@ -396,12 +389,10 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) {
;
; CHECK-GI-LABEL: zext_v3i32_v3i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[0]
+; CHECK-GI-NEXT: mov w9, v0.s[1]
+; CHECK-GI-NEXT: mov w10, v0.s[2]
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w10, s2
; CHECK-GI-NEXT: fmov d1, x9
; CHECK-GI-NEXT: fmov d2, x10
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
index 9f3ad8b44444..96a776f6fbb6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
@@ -450,8 +450,9 @@ body: |
; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
- ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %zero, %variable
+ ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero
+ ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+ ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
%reg:_(s32) = COPY $vgpr0
%variable:_(s32) = COPY $vgpr0
@@ -476,7 +477,8 @@ body: |
; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
- ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %variable, %zero
+ ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+ ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
%reg:_(s32) = COPY $vgpr0
%variable:_(s32) = COPY $vgpr0
@@ -500,9 +502,9 @@ body: |
; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
- ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %variable, %neg1
+ ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero
+ ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+ ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
%reg:_(s32) = COPY $vgpr0
%variable:_(s32) = COPY $vgpr0
@@ -527,8 +529,8 @@ body: |
; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
- ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %neg1, %variable
+ ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+ ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
%reg:_(s32) = COPY $vgpr0
%variable:_(s32) = COPY $vgpr0
@@ -667,9 +669,9 @@ body: |
; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
- ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
; CHECK-NEXT: %otherconst:_(s32) = G_CONSTANT i32 123
- ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %neg1, %otherconst
+ ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT %cond(s1)
+ ; CHECK-NEXT: %select:_(s32) = G_OR [[SEXT]], %otherconst
; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
%reg:_(s32) = COPY $vgpr0
@@ -749,8 +751,7 @@ body: |
; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: %srem:_(s32) = G_SELECT %cond(s1), [[C]], %zero
+ ; CHECK-NEXT: %srem:_(s32) = G_ZEXT %cond(s1)
; CHECK-NEXT: S_ENDPGM 0, implicit %srem(s32)
%reg:_(s32) = COPY $vgpr0
%zero:_(s32) = G_CONSTANT i32 0
@@ -802,8 +803,7 @@ body: |
; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: %udiv:_(s32) = G_SELECT %cond(s1), [[C]], %zero
+ ; CHECK-NEXT: %udiv:_(s32) = G_ZEXT %cond(s1)
; CHECK-NEXT: S_ENDPGM 0, implicit %udiv(s32)
%reg:_(s32) = COPY $vgpr0
%zero:_(s32) = G_CONSTANT i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index ccf4e84fbbbd..4ac1fad6deec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -37,7 +37,8 @@ define amdgpu_ps void @divergent_i1_phi_uniform_branch(ptr addrspace(1) %out, i3
; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
; GFX10-NEXT: global_store_dword v[3:4], v5, off
; GFX10-NEXT: .LBB0_3: ; %exit
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
A:
@@ -72,7 +73,8 @@ define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) %
; GFX10-NEXT: .LBB1_2: ; %B
; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
; GFX10-NEXT: .LBB1_3: ; %exit
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
A:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index afd271c99577..c1f3924e466d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -14,7 +14,8 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid,
; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
A:
@@ -51,7 +52,8 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid,
; GFX10-NEXT: v_cmp_le_u32_e64 s0, 1, v2
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
index 0f85c80954ac..bb37e54e3b56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll
@@ -66,7 +66,7 @@ define amdgpu_kernel void @asm_simple_agpr_clobber() {
define i32 @asm_vgpr_early_clobber() {
; CHECK-LABEL: name: asm_vgpr_early_clobber
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2490379 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2490379 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %8, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
@@ -94,7 +94,7 @@ entry:
define i32 @test_single_vgpr_output() nounwind {
; CHECK-LABEL: name: test_single_vgpr_output
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2490378 /* regdef:VGPR_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -106,7 +106,7 @@ entry:
define i32 @test_single_sgpr_output_s32() nounwind {
; CHECK-LABEL: name: test_single_sgpr_output_s32
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -119,7 +119,7 @@ entry:
define float @test_multiple_register_outputs_same() #0 {
; CHECK-LABEL: name: test_multiple_register_outputs_same
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2490378 /* regdef:VGPR_32 */, def %8, 2490378 /* regdef:VGPR_32 */, def %9
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 2228234 /* regdef:VGPR_32 */, def %9
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]]
@@ -136,7 +136,7 @@ define float @test_multiple_register_outputs_same() #0 {
define double @test_multiple_register_outputs_mixed() #0 {
; CHECK-LABEL: name: test_multiple_register_outputs_mixed
; CHECK: bb.1 (%ir-block.0):
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2490378 /* regdef:VGPR_32 */, def %8, 3801098 /* regdef:VReg_64 */, def %9
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %8, 3538954 /* regdef:VReg_64 */, def %9
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %9
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
@@ -171,7 +171,7 @@ define amdgpu_kernel void @test_input_vgpr_imm() {
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[C]](s32)
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2490377 /* reguse:VGPR_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 v0, $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY1]]
; CHECK-NEXT: S_ENDPGM 0
call void asm sideeffect "v_mov_b32 v0, $0", "v"(i32 42)
ret void
@@ -185,7 +185,7 @@ define amdgpu_kernel void @test_input_sgpr_imm() {
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[C]](s32)
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2621449 /* reguse:SReg_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 s0, $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[COPY1]]
; CHECK-NEXT: S_ENDPGM 0
call void asm sideeffect "s_mov_b32 s0, $0", "s"(i32 42)
ret void
@@ -212,7 +212,7 @@ define float @test_input_vgpr(i32 %src) nounwind {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
- ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2490378 /* regdef:VGPR_32 */, def %9, 2490377 /* reguse:VGPR_32 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 2228233 /* reguse:VGPR_32 */, [[COPY1]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -227,7 +227,7 @@ define i32 @test_memory_constraint(ptr addrspace(3) %a) nounwind {
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
- ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2490378 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3)
+ ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 2228234 /* regdef:VGPR_32 */, def %9, 262158 /* mem:m */, [[COPY]](p3)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %9
; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -244,7 +244,7 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32)
- ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2490378 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %11
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -256,13 +256,13 @@ define i32 @test_vgpr_matching_constraint(i32 %a) nounwind {
define i32 @test_sgpr_matching_constraint() nounwind {
; CHECK-LABEL: name: test_sgpr_matching_constraint
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %10
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %10
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %10
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32)
- ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %12, 2621449 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %12, 2359305 /* reguse:SReg_32 */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY %12
; CHECK-NEXT: $vgpr0 = COPY [[COPY4]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
@@ -285,7 +285,7 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
- ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2490378 /* regdef:VGPR_32 */, def %11, 2490378 /* regdef:VGPR_32 */, def %12, 2490378 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5)
+ ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def %11, 2228234 /* regdef:VGPR_32 */, def %12, 2228234 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY4]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY5]](tied-def 5)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY %11
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %12
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %13
@@ -306,10 +306,10 @@ define void @test_many_matching_constraints(i32 %a, i32 %b, i32 %c) nounwind {
define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
; CHECK-LABEL: name: test_sgpr_to_vgpr_move_matching_constraint
; CHECK: bb.1.entry:
- ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2621450 /* regdef:SReg_32 */, def %8
+ ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 2359306 /* regdef:SReg_32 */, def %8
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32)
- ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2490378 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10
; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index db33ed8fa556..e3d2ecefbda3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -589,13 +589,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -633,13 +630,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -676,10 +670,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -758,16 +749,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -849,16 +836,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W64-NEXT: s_min_u32 s5, s6, s5
+; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -961,13 +944,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1007,13 +987,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1052,10 +1029,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -1140,16 +1114,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1237,16 +1207,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W64-NEXT: s_min_u32 s5, s6, s5
+; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
@@ -2005,13 +1971,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB7_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2049,13 +2012,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2092,10 +2052,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -2174,16 +2131,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2266,16 +2219,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W64-NEXT: s_min_u32 s5, s6, s5
+; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 6a664f26d470..9f97f1f4bace 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -657,15 +657,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s3
-; GFX8-NEXT: s_ff1_i32_b32 s5, s2
-; GFX8-NEXT: s_add_i32 s4, s4, 32
-; GFX8-NEXT: s_min_u32 s7, s5, s4
-; GFX8-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX8-NEXT: s_mov_b32 m0, s7
+; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s4
+; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s8
+; GFX8-NEXT: s_add_i32 s6, s6, s7
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
@@ -705,15 +702,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s3
-; GFX9-NEXT: s_ff1_i32_b32 s5, s2
-; GFX9-NEXT: s_add_i32 s4, s4, 32
-; GFX9-NEXT: s_min_u32 s7, s5, s4
-; GFX9-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX9-NEXT: s_mov_b32 m0, s7
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s4
+; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s8
+; GFX9-NEXT: s_add_i32 s6, s6, s7
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
@@ -753,10 +747,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s2
-; GFX1064-NEXT: s_add_i32 s4, s4, 32
-; GFX1064-NEXT: s_min_u32 s7, s5, s4
+; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
@@ -847,16 +838,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s2
-; GFX1164-NEXT: s_add_i32 s4, s4, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s7, s5, s4
+; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s6, s6, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
@@ -951,16 +938,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1264-NEXT: s_ctz_i32_b32 s5, s2
-; GFX1264-NEXT: s_add_co_i32 s4, s4, 32
-; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1264-NEXT: s_min_u32 s7, s5, s4
+; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1
@@ -2557,15 +2540,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s3
-; GFX8-NEXT: s_ff1_i32_b32 s5, s2
-; GFX8-NEXT: s_add_i32 s4, s4, 32
-; GFX8-NEXT: s_min_u32 s7, s5, s4
-; GFX8-NEXT: v_readlane_b32 s8, v0, s7
-; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX8-NEXT: s_mov_b32 m0, s7
+; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s4
+; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s8
+; GFX8-NEXT: s_add_i32 s6, s6, s7
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
@@ -2605,15 +2585,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s3
-; GFX9-NEXT: s_ff1_i32_b32 s5, s2
-; GFX9-NEXT: s_add_i32 s4, s4, 32
-; GFX9-NEXT: s_min_u32 s7, s5, s4
-; GFX9-NEXT: v_readlane_b32 s8, v0, s7
-; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX9-NEXT: s_mov_b32 m0, s7
+; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s4
+; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s8
+; GFX9-NEXT: s_add_i32 s6, s6, s7
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
@@ -2653,10 +2630,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s2
-; GFX1064-NEXT: s_add_i32 s4, s4, 32
-; GFX1064-NEXT: s_min_u32 s7, s5, s4
+; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
@@ -2747,16 +2721,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s2
-; GFX1164-NEXT: s_add_i32 s4, s4, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s7, s5, s4
+; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s6, s6, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
@@ -2851,16 +2821,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b32 s4, s3
-; GFX1264-NEXT: s_ctz_i32_b32 s5, s2
-; GFX1264-NEXT: s_add_co_i32 s4, s4, 32
-; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1264-NEXT: s_min_u32 s7, s5, s4
+; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index cf2afeb7b01b..34499043ce6b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -478,13 +478,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -522,13 +519,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -565,10 +559,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -649,16 +640,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
@@ -757,10 +744,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s3, s1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s0
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s3, s4, s3
+; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3
; GFX8-NEXT: s_add_i32 s2, s2, s6
@@ -789,10 +773,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s0
-; GFX9-NEXT: s_add_i32 s3, s3, 32
-; GFX9-NEXT: s_min_u32 s3, s4, s3
+; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3
; GFX9-NEXT: s_add_i32 s2, s2, s6
@@ -820,10 +801,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064-NEXT: s_mov_b32 s2, 0
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s0
-; GFX1064-NEXT: s_add_i32 s3, s3, 32
-; GFX1064-NEXT: s_min_u32 s3, s4, s3
+; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
@@ -880,15 +858,12 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164-NEXT: s_mov_b32 s2, 0
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s4, s0
-; GFX1164-NEXT: s_add_i32 s3, s3, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s3, s4, s3
+; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: s_add_i32 s2, s2, s6
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
@@ -2005,13 +1980,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB9_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2049,13 +2021,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB9_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -2092,10 +2061,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -2176,16 +2142,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_add_i32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
@@ -2284,10 +2246,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: .LBB10_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s3, s1
-; GFX8-NEXT: s_ff1_i32_b32 s4, s0
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s3, s4, s3
+; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3
; GFX8-NEXT: s_add_i32 s2, s2, s6
@@ -2316,10 +2275,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: .LBB10_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s1
-; GFX9-NEXT: s_ff1_i32_b32 s4, s0
-; GFX9-NEXT: s_add_i32 s3, s3, 32
-; GFX9-NEXT: s_min_u32 s3, s4, s3
+; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3
; GFX9-NEXT: s_add_i32 s2, s2, s6
@@ -2347,10 +2303,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064-NEXT: s_mov_b32 s2, 0
; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s4, s0
-; GFX1064-NEXT: s_add_i32 s3, s3, 32
-; GFX1064-NEXT: s_min_u32 s3, s4, s3
+; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
@@ -2407,15 +2360,12 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164-NEXT: s_mov_b32 s2, 0
; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s4, s0
-; GFX1164-NEXT: s_add_i32 s3, s3, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s3, s4, s3
+; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: s_add_i32 s2, s2, s6
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
@@ -3105,13 +3055,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB14_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_and_b32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -3149,13 +3096,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_and_b32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -3192,10 +3136,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -3276,16 +3217,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_and_b32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
@@ -3392,13 +3329,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB15_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_or_b32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -3436,13 +3370,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_or_b32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -3479,10 +3410,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -3563,16 +3491,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_or_b32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
@@ -3679,13 +3603,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB16_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_xor_b32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -3723,13 +3644,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB16_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_xor_b32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -3766,10 +3684,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -3850,16 +3765,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_xor_b32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1
@@ -3966,13 +3877,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB17_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_max_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -4010,13 +3918,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_max_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -4053,10 +3958,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -4137,16 +4039,12 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_max_i32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
@@ -4495,13 +4393,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB19_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_min_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -4539,13 +4434,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB19_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_min_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -4582,10 +4474,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -4666,16 +4555,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_min_i32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1
@@ -5024,13 +4909,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB21_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_max_u32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -5068,13 +4950,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB21_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_max_u32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -5111,10 +4990,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -5195,16 +5071,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_max_u32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1
@@ -5548,13 +5420,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB23_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_min_u32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -5592,13 +5461,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB23_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_min_u32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -5635,10 +5501,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s5, s3
-; GFX1064-NEXT: s_ff1_i32_b32 s6, s2
-; GFX1064-NEXT: s_add_i32 s5, s5, 32
-; GFX1064-NEXT: s_min_u32 s5, s6, s5
+; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX1064-NEXT: v_readlane_b32 s8, v0, s5
; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1064-NEXT: v_writelane_b32 v1, s4, s5
@@ -5719,16 +5582,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b32 s5, s3
-; GFX1164-NEXT: s_ctz_i32_b32 s6, s2
-; GFX1164-NEXT: s_add_i32 s5, s5, 32
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_min_u32 s5, s6, s5
+; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readlane_b32 s8, v0, s5
; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX1164-NEXT: v_writelane_b32 v1, s4, s5
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: s_min_u32 s4, s4, s8
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 43068a28812e..79f8b3a1d5d8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -588,13 +588,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -632,13 +629,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -675,10 +669,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -757,16 +748,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -848,16 +835,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W64-NEXT: s_min_u32 s5, s6, s5
+; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -1610,13 +1593,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB6_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1654,13 +1634,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1697,10 +1674,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -1779,16 +1753,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
@@ -1871,16 +1841,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W64-NEXT: s_min_u32 s5, s6, s5
+; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 5ac8ed8df456..edf6fbadf1a6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -605,13 +605,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -650,13 +647,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -694,10 +688,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -778,16 +769,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -871,16 +858,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W64-NEXT: s_min_u32 s5, s6, s5
+; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
@@ -1785,13 +1768,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB7_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_ff1_i32_b32 s5, s3
-; GFX8-NEXT: s_ff1_i32_b32 s6, s2
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s5, s6, s5
+; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_readlane_b32 s8, v0, s5
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX8-NEXT: s_mov_b32 m0, s5
; GFX8-NEXT: v_writelane_b32 v1, s4, m0
; GFX8-NEXT: s_add_i32 s4, s4, s8
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1830,13 +1810,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB7_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s5, s3
-; GFX9-NEXT: s_ff1_i32_b32 s6, s2
-; GFX9-NEXT: s_add_i32 s5, s5, 32
-; GFX9-NEXT: s_min_u32 s5, s6, s5
+; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_readlane_b32 s8, v0, s5
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5
-; GFX9-NEXT: s_mov_b32 m0, s5
; GFX9-NEXT: v_writelane_b32 v1, s4, m0
; GFX9-NEXT: s_add_i32 s4, s4, s8
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
@@ -1874,10 +1851,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; implicit-def: $vgpr1
; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3
-; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2
-; GFX10W64-NEXT: s_add_i32 s5, s5, 32
-; GFX10W64-NEXT: s_min_u32 s5, s6, s5
+; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5
@@ -1958,16 +1932,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: ; implicit-def: $vgpr1
; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX11W64-NEXT: s_add_i32 s5, s5, 32
-; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11W64-NEXT: s_min_u32 s5, s6, s5
+; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: s_add_i32 s4, s4, s8
; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2052,16 +2022,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3
-; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2
-; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12W64-NEXT: s_min_u32 s5, s6, s5
+; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5
; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8
; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 79b9f8caea94..adc23860e896 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -72,6 +72,1831 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
ret void
}
+define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+ ret <2 x bfloat> %load
+}
+
+define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+ ret <3 x bfloat> %load
+}
+
+define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+ ret <4 x bfloat> %load
+}
+
+define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v6bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v6bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v6bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v6bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v6bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v6bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+ ret <6 x bfloat> %load
+}
+
+define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+ ret <8 x bfloat> %load
+}
+
+define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[0:3], v[4:5], off
+; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+ ret <16 x bfloat> %load
+}
+
+define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v12, v0
+; GFX8-NEXT: v_mov_b32_e32 v13, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v12
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v13, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v12
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[12:13]
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v12
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v17, v1
+; GFX9-NEXT: v_mov_b32_e32 v16, v0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v17, v1
+; GFX10-NEXT: v_mov_b32_e32 v16, v0
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_load_b128 v[0:3], v[12:13], off
+; GFX11-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
+; GFX11-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
+; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+ ret <32 x bfloat> %load
+}
+
+define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
+; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0
+; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0
+; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v64bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
+; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0
+; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0
+; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0
+; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0
+; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v64bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v28, v0
+; GFX8-NEXT: v_mov_b32_e32 v29, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v28
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v29, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v28
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v29, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28
+; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x50
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28
+; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x60
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x70
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
+; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
+; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v29, v1
+; GFX9-NEXT: v_mov_b32_e32 v28, v0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v64bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v33, v1
+; GFX10-NEXT: v_mov_b32_e32 v32, v0
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[32:33], off
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[32:33], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[32:33], off offset:32
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v[32:33], off offset:48
+; GFX10-NEXT: global_load_dwordx4 v[16:19], v[32:33], off offset:64
+; GFX10-NEXT: global_load_dwordx4 v[20:23], v[32:33], off offset:80
+; GFX10-NEXT: global_load_dwordx4 v[24:27], v[32:33], off offset:96
+; GFX10-NEXT: global_load_dwordx4 v[28:31], v[32:33], off offset:112
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_load_b128 v[0:3], v[28:29], off
+; GFX11-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
+; GFX11-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
+; GFX11-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
+; GFX11-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
+; GFX11-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
+; GFX11-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
+; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <64 x bfloat>, ptr addrspace(1) %ptr
+ ret <64 x bfloat> %load
+}
+
+define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dword v[1:2], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dword v[2:3], v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_short v[2:3], v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX10-NEXT: global_store_dword v[2:3], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
+; GFX11-NEXT: global_store_b32 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <3 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <4 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v10, v4, 16
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <8 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v18, v4, 16
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v19, v12, 16
+; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
+; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: v_alignbit_b32 v14, v0, v14, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v13, v0, v12, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v12, v0, v10, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v11, v0, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
+; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <16 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v0, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v1, v12, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GCN-NEXT: v_alignbit_b32 v11, v0, v10, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GCN-NEXT: v_alignbit_b32 v10, v0, v8, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GCN-NEXT: v_alignbit_b32 v9, v0, v22, 16
+; GCN-NEXT: v_alignbit_b32 v8, v1, v20, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GCN-NEXT: v_alignbit_b32 v7, v0, v18, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GCN-NEXT: v_alignbit_b32 v6, v0, v16, 16
+; GCN-NEXT: v_alignbit_b32 v16, v1, v28, 16
+; GCN-NEXT: v_alignbit_b32 v15, v14, v26, 16
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GCN-NEXT: v_alignbit_b32 v14, v0, v24, 16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: v_alignbit_b32 v17, v17, v30, 16
+; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v13, v0, v14, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT: v_alignbit_b32 v10, v0, v8, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v9, v0, v22, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GFX7-NEXT: v_alignbit_b32 v7, v0, v18, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT: v_alignbit_b32 v8, v1, v20, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GFX7-NEXT: v_alignbit_b32 v6, v0, v16, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX7-NEXT: v_alignbit_b32 v16, v1, v28, 16
+; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16
+; GFX7-NEXT: v_alignbit_b32 v14, v0, v24, 16
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_alignbit_b32 v17, v17, v30, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48
+; GFX11-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32
+; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16
+; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <32 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_alignbit_b32 v22, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v21, v21, v20, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_alignbit_b32 v20, v19, v18, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_alignbit_b32 v19, v19, v16, 16
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v12, v4, 16
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v28, 16
+; GCN-NEXT: v_alignbit_b32 v7, v14, v26, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_alignbit_b32 v6, v15, v24, 16
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(9)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_alignbit_b32 v9, v9, v30, 16
+; GCN-NEXT: v_alignbit_b32 v13, v10, v11, 16
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_alignbit_b32 v12, v12, v16, 16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_alignbit_b32 v11, v11, v0, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GCN-NEXT: v_alignbit_b32 v10, v1, v14, 16
+; GCN-NEXT: v_alignbit_b32 v22, v15, v16, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_alignbit_b32 v21, v20, v0, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v19
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_alignbit_b32 v20, v15, v0, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v19, v0, v14, 16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[17:18], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[17:18], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v64bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v13, v0, v14, 16
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16
+; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
+; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_alignbit_b32 v10, v9, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v8, v21, v20, 16
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: v_alignbit_b32 v9, v23, v22, 16
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_alignbit_b32 v23, v6, v7, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v22, v15, v31, 16
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v32
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_alignbit_b32 v21, v20, v33, 16
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v34
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v19
+; GFX7-NEXT: v_alignbit_b32 v7, v6, v18, 16
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_alignbit_b32 v20, v32, v14, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: buffer_store_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84
+; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80
+; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v29
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v16, 16
+; GFX7-NEXT: v_alignbit_b32 v16, v15, v28, 16
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v35
+; GFX7-NEXT: v_alignbit_b32 v17, v14, v30, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v25
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_alignbit_b32 v14, v14, v24, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v31
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_alignbit_b32 v21, v19, v32, 16
+; GFX7-NEXT: s_waitcnt vmcnt(13)
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: v_alignbit_b32 v20, v19, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(11)
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v36
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_alignbit_b32 v19, v19, v37, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v25, v22, v23, 16
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v38
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_alignbit_b32 v24, v24, v18, 16
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v39
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_alignbit_b32 v23, v18, v48, 16
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v28
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v22, v22, v29, 16
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_alignbit_b32 v18, v22, v18, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v27
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v25, v22, v30, 16
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_alignbit_b32 v24, v23, v32, 16
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_alignbit_b32 v23, v22, v28, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v29
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_alignbit_b32 v22, v22, v34, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v64bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT: s_movk_i32 s4, 0x70
+; GFX8-NEXT: s_movk_i32 s5, 0x50
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x60
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31]
+; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32
+; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v32
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32
+; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32
+; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
+; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v64bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112
+; GFX11-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96
+; GFX11-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80
+; GFX11-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64
+; GFX11-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48
+; GFX11-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32
+; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
+; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <64 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
+; GCN-LABEL: test_store_fpimm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_store_fpimm:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_store_fpimm:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX8-NEXT: flat_store_short v[0:1], v4
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_store_fpimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX9-NEXT: global_store_short v[0:1], v4, off
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_store_fpimm:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX10-NEXT: global_store_short v[0:1], v4, off
+; GFX10-NEXT: global_store_short v[2:3], v5, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_store_fpimm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX11-NEXT: global_store_b16 v[0:1], v4, off
+; GFX11-NEXT: global_store_b16 v[2:3], v5, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store bfloat 1.0, ptr addrspace(1) %ptr0
+ store bfloat 42.0, ptr addrspace(1) %ptr1
+ ret void
+}
+
define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_f32_to_bf16:
; GCN: ; %bb.0:
@@ -8750,6 +10575,112 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
ret <32 x bfloat> %op
}
+define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16_fpimm_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %add = fadd bfloat %arg0, 1.0
+ ret bfloat %add
+}
+
+define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16_fpimm_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %add = fadd bfloat %arg0, 42.0
+ ret bfloat %add
+}
+
define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fsub_bf16:
; GCN: ; %bb.0:
@@ -9504,6 +11435,1509 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
ret <4 x bfloat> %op
}
+define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT: v_mul_f32_e32 v9, v10, v9
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT: v_mul_f32_e32 v10, v11, v10
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT: v_mul_f32_e32 v11, v12, v11
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_mul_f32_e32 v9, v11, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mul_f32_e32 v10, v13, v12
+; GFX10-NEXT: v_mul_f32_e32 v11, v15, v14
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_f32_e32 v9, v10, v9
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX11-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX11-NEXT: v_dual_mul_f32 v10, v12, v11 :: v_dual_mul_f32 v11, v14, v13
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <8 x bfloat> %a, %b
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v14, v14, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v13, v13, v29
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v15, v15, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT: v_mul_f32_e32 v17, v18, v17
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT: v_mul_f32_e32 v18, v19, v18
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_mul_f32_e32 v19, v20, v19
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT: v_mul_f32_e32 v20, v21, v20
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT: v_mul_f32_e32 v21, v22, v21
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT: v_mul_f32_e32 v22, v23, v22
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v23, v24, v23
+; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v2
+; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_mul_f32_e32 v18, v20, v19
+; GFX10-NEXT: v_mul_f32_e32 v19, v22, v21
+; GFX10-NEXT: v_mul_f32_e32 v20, v24, v23
+; GFX10-NEXT: v_mul_f32_e32 v21, v26, v25
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_mul_f32_e32 v22, v23, v22
+; GFX10-NEXT: v_mul_f32_e32 v23, v25, v24
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_mul_f32 v5, v5, v13
+; GFX11-NEXT: v_dual_mul_f32 v17, v18, v17 :: v_dual_mul_f32 v18, v20, v19
+; GFX11-NEXT: v_mul_f32_e32 v19, v22, v21
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX11-NEXT: v_mul_f32_e32 v21, v26, v25
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
+; GFX11-NEXT: v_mul_f32_e32 v20, v24, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_mul_f32 v3, v3, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v9 :: v_dual_mul_f32 v22, v23, v22
+; GFX11-NEXT: v_mul_f32_e32 v23, v25, v24
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <16 x bfloat> %a, %b
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v31, v32, v31
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v30, v30, v32
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v29, v29, v33
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v27, v27, v33
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v25, v25, v33
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v23, v23, v33
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v21, v21, v33
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v19, v19, v33
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v17, v17, v33
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v15, v15, v33
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v13, v13, v33
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v33
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v33
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v33
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v33
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v33
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v33
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v31, v32, v31
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31
+; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v30, v32, v30
+; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v29, v32, v29
+; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v28, v32, v28
+; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v27, v32, v27
+; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v26, v32, v26
+; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT: v_mul_f32_e32 v25, v32, v25
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v32, v32, v33
+; GFX8-NEXT: v_mul_f32_e32 v15, v15, v24
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24
+; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22
+; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21
+; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT: v_perm_b32 v15, v15, v32, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v58, 0xffff0000, v17
+; GFX9-NEXT: v_and_b32_e32 v59, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v21
+; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v44, 0xffff0000, v20
+; GFX9-NEXT: v_and_b32_e32 v45, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v46, 0xffff0000, v19
+; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v18
+; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v2
+; GFX9-NEXT: v_mul_f32_e32 v38, v39, v38
+; GFX9-NEXT: v_mul_f32_e32 v39, v49, v48
+; GFX9-NEXT: v_mul_f32_e32 v48, v51, v50
+; GFX9-NEXT: v_mul_f32_e32 v51, v41, v40
+; GFX9-NEXT: v_mul_f32_e32 v40, v59, v58
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_mul_f32_e32 v49, v53, v52
+; GFX9-NEXT: v_mul_f32_e32 v50, v55, v54
+; GFX9-NEXT: v_mul_f32_e32 v52, v43, v42
+; GFX9-NEXT: v_mul_f32_e32 v53, v45, v44
+; GFX9-NEXT: v_mul_f32_e32 v54, v47, v46
+; GFX9-NEXT: v_mul_f32_e32 v55, v57, v56
+; GFX9-NEXT: v_perm_b32 v1, v1, v40, s4
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v29
+; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16
+; GFX9-NEXT: v_mul_f32_e32 v34, v35, v34
+; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v0
+; GFX9-NEXT: v_mul_f32_e32 v36, v37, v36
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_mul_f32_e32 v33, v35, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX9-NEXT: v_perm_b32 v0, v0, v33, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v55, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v54, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v53, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v52, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v51, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v50, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v49, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v48, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v39, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v38, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v36, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v34, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v32, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_mul_f32_e32 v35, v37, v35
+; GFX9-NEXT: v_mul_f32_e32 v15, v15, v31
+; GFX9-NEXT: v_perm_b32 v15, v15, v35, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT: v_mul_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT: v_mul_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT: v_mul_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_mul_f32_e32 v67, v68, v67
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_mul_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_mul_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_mul_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_mul_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_mul_f32_e32 v50, v68, v66
+; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT: v_mul_f32_e32 v16, v32, v16
+; GFX10-NEXT: v_mul_f32_e32 v15, v15, v17
+; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_and_b32_e32 v82, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v80, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_dual_mul_f32 v33, v34, v33 :: v_dual_mul_f32 v34, v36, v35
+; GFX11-NEXT: v_dual_mul_f32 v35, v38, v37 :: v_dual_mul_f32 v12, v12, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v5, v5, v21
+; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v13, v13, v29
+; GFX11-NEXT: v_dual_mul_f32 v15, v15, v17 :: v_dual_mul_f32 v14, v14, v30
+; GFX11-NEXT: v_mul_f32_e32 v36, v48, v39
+; GFX11-NEXT: v_dual_mul_f32 v48, v64, v55 :: v_dual_mul_f32 v37, v50, v49
+; GFX11-NEXT: v_mul_f32_e32 v50, v68, v67
+; GFX11-NEXT: v_dual_mul_f32 v38, v52, v51 :: v_dual_mul_f32 v51, v70, v69
+; GFX11-NEXT: v_dual_mul_f32 v52, v80, v71 :: v_dual_mul_f32 v39, v54, v53
+; GFX11-NEXT: v_dual_mul_f32 v53, v82, v81 :: v_dual_mul_f32 v54, v84, v83
+; GFX11-NEXT: v_mul_f32_e32 v55, v86, v85
+; GFX11-NEXT: v_mul_f32_e32 v49, v66, v65
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <32 x bfloat> %a, %b
+ ret <32 x bfloat> %op
+}
+
define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fdiv_bf16:
; GCN: ; %bb.0:
@@ -9716,7 +13150,8 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
- ret i32 %zext
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
}
define bfloat @v_fneg_bf16(bfloat %a) {
@@ -9943,6 +13378,11 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_minnum_bf16:
@@ -10120,8 +13560,2440 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
ret <2 x bfloat> %op
}
+define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v2, v2, v5
+; GCN-NEXT: v_min_f32_e32 v1, v1, v4
+; GCN-NEXT: v_min_f32_e32 v0, v0, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_min_f32 v0, v0, v2
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v3, v3, v7
+; GCN-NEXT: v_min_f32_e32 v2, v2, v6
+; GCN-NEXT: v_min_f32_e32 v1, v1, v5
+; GCN-NEXT: v_min_f32_e32 v0, v0, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_min_f32_e32 v3, v5, v4
+; GFX10-NEXT: v_min_f32_e32 v4, v7, v6
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_min_f32 v1, v1, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v4, v6, v5
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v7, v7, v15
+; GCN-NEXT: v_min_f32_e32 v6, v6, v14
+; GCN-NEXT: v_min_f32_e32 v5, v5, v13
+; GCN-NEXT: v_min_f32_e32 v4, v4, v12
+; GCN-NEXT: v_min_f32_e32 v3, v3, v11
+; GCN-NEXT: v_min_f32_e32 v2, v2, v10
+; GCN-NEXT: v_min_f32_e32 v1, v1, v9
+; GCN-NEXT: v_min_f32_e32 v0, v0, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_min_f32_e32 v9, v10, v9
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_min_f32_e32 v10, v11, v10
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT: v_min_f32_e32 v11, v12, v11
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v11, v11
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_min_f32_e32 v9, v9, v10
+; GFX10-NEXT: v_max_f32_e32 v10, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v11, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v12, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v13, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f32_e32 v10, v11, v10
+; GFX10-NEXT: v_min_f32_e32 v11, v13, v12
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_min_f32 v8, v9, v8
+; GFX11-NEXT: v_min_f32_e32 v9, v11, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_max_f32_e32 v10, v12, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v10, v11, v10
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v11, v13, v12
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v3, v3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
+; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_min_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_min_f32_e32 v14, v14, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_min_f32_e32 v13, v13, v29
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_min_f32_e32 v12, v12, v28
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_min_f32_e32 v11, v11, v27
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_min_f32_e32 v10, v10, v26
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_min_f32_e32 v9, v9, v25
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_min_f32_e32 v8, v8, v24
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_min_f32_e32 v7, v7, v23
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_min_f32_e32 v6, v6, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_min_f32_e32 v5, v5, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_min_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v3, v3, v19
+; GCN-NEXT: v_min_f32_e32 v2, v2, v18
+; GCN-NEXT: v_min_f32_e32 v1, v1, v17
+; GCN-NEXT: v_min_f32_e32 v0, v0, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_min_f32_e32 v15, v15, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_min_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_min_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_min_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_min_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v9, v17, v9
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_min_f32_e32 v17, v18, v17
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_min_f32_e32 v18, v19, v18
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_min_f32_e32 v19, v20, v19
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_min_f32_e32 v20, v21, v20
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_min_f32_e32 v21, v22, v21
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_min_f32_e32 v22, v23, v22
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v23, v24, v23
+; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v19, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v21, v22, v22
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_min_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_min_f32_e32 v19, v21, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v21, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v22, v24, v24
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_min_f32_e32 v20, v21, v20
+; GFX10-NEXT: v_min_f32_e32 v21, v23, v22
+; GFX10-NEXT: v_min_f32_e32 v22, v25, v24
+; GFX10-NEXT: v_min_f32_e32 v23, v27, v26
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
+; GFX11-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
+; GFX11-NEXT: v_dual_min_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
+; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_min_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
+; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
+; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_dual_min_f32 v21, v23, v22 :: v_dual_min_f32 v22, v25, v24
+; GFX11-NEXT: v_dual_min_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_dual_min_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_min_f32 v5, v5, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v3, v3, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_min_f32 v1, v1, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_min_f32_e32 v31, v32, v31
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_min_f32_e32 v30, v30, v32
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_min_f32_e32 v29, v29, v32
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_min_f32_e32 v28, v28, v32
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_min_f32_e32 v27, v27, v32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_min_f32_e32 v26, v26, v32
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_min_f32_e32 v25, v25, v32
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_min_f32_e32 v24, v24, v32
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_min_f32_e32 v23, v23, v32
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_min_f32_e32 v22, v22, v32
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_min_f32_e32 v21, v21, v32
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_min_f32_e32 v20, v20, v32
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_min_f32_e32 v19, v19, v32
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_min_f32_e32 v18, v18, v32
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_min_f32_e32 v17, v17, v32
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_min_f32_e32 v16, v16, v32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_min_f32_e32 v15, v15, v32
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_min_f32_e32 v14, v14, v32
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_min_f32_e32 v13, v13, v32
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_min_f32_e32 v12, v12, v32
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_min_f32_e32 v11, v11, v32
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_min_f32_e32 v10, v10, v32
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_min_f32_e32 v9, v9, v32
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_min_f32_e32 v8, v8, v32
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_min_f32_e32 v7, v7, v32
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_min_f32_e32 v6, v6, v32
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_min_f32_e32 v5, v5, v32
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_min_f32_e32 v4, v4, v32
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_min_f32_e32 v3, v3, v32
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_min_f32_e32 v2, v2, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_min_f32_e32 v1, v1, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_min_f32_e32 v0, v0, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v31, v32, v31
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_min_f32_e32 v31, v32, v31
+; GFX8-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_min_f32_e32 v30, v32, v30
+; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_min_f32_e32 v29, v32, v29
+; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_min_f32_e32 v28, v32, v28
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_min_f32_e32 v27, v27, v33
+; GFX8-NEXT: v_min_f32_e32 v15, v15, v32
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_min_f32_e32 v32, v33, v32
+; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_min_f32_e32 v26, v33, v26
+; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_min_f32_e32 v25, v33, v25
+; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_min_f32_e32 v24, v33, v24
+; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_min_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v22, v33, v22
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_min_f32_e32 v21, v33, v21
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_min_f32_e32 v20, v33, v20
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_min_f32_e32 v19, v33, v19
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v18, v33, v18
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v17, v33, v17
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4
+; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT: v_max_f32_e32 v31, v31, v31
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX9-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX9-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX9-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX9-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
+; GFX9-NEXT: v_min_f32_e32 v32, v34, v33
+; GFX9-NEXT: v_min_f32_e32 v33, v37, v36
+; GFX9-NEXT: v_min_f32_e32 v37, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX9-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v50, v43, v43
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_min_f32_e32 v34, v39, v38
+; GFX9-NEXT: v_min_f32_e32 v38, v53, v52
+; GFX9-NEXT: v_min_f32_e32 v50, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT: v_min_f32_e32 v51, v52, v51
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_min_f32_e32 v39, v55, v54
+; GFX9-NEXT: v_min_f32_e32 v52, v53, v52
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT: v_min_f32_e32 v53, v54, v53
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15
+; GFX9-NEXT: v_min_f32_e32 v36, v49, v48
+; GFX9-NEXT: v_min_f32_e32 v48, v41, v40
+; GFX9-NEXT: v_min_f32_e32 v54, v55, v54
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v42, v42, v42
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_min_f32_e32 v55, v40, v55
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_min_f32_e32 v49, v42, v49
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v15, v15, v35
+; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v34, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v67, v67, v67
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT: v_min_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT: v_min_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT: v_min_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_min_f32_e32 v67, v68, v67
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
+; GFX10-NEXT: v_min_f32_e32 v32, v34, v32
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_min_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_min_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_min_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_min_f32_e32 v50, v68, v66
+; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_min_f32_e32 v16, v33, v16
+; GFX10-NEXT: v_min_f32_e32 v15, v15, v17
+; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
+; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
+; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
+; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
+; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_dual_min_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
+; GFX11-NEXT: v_dual_min_f32 v34, v36, v35 :: v_dual_min_f32 v35, v38, v37
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
+; GFX11-NEXT: v_dual_min_f32 v36, v48, v39 :: v_dual_min_f32 v37, v50, v49
+; GFX11-NEXT: v_min_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
+; GFX11-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
+; GFX11-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX11-NEXT: v_max_f32_e32 v80, v80, v80
+; GFX11-NEXT: v_max_f32_e32 v82, v82, v82
+; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
+; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
+; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: v_dual_min_f32 v38, v52, v51 :: v_dual_min_f32 v53, v82, v81
+; GFX11-NEXT: v_dual_min_f32 v48, v64, v55 :: v_dual_min_f32 v55, v86, v85
+; GFX11-NEXT: v_dual_min_f32 v49, v66, v65 :: v_dual_min_f32 v50, v68, v67
+; GFX11-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_dual_min_f32 v51, v70, v69 :: v_dual_min_f32 v52, v80, v71
+; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_min_f32 v54, v84, v83
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v14, v14, v30
+; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_min_f32 v12, v12, v28
+; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_min_f32 v8, v8, v24
+; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_min_f32 v4, v4, v20
+; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_min_f32 v2, v2, v18
+; GFX11-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v15, v15, v17
+; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_min_f32_e32 v16, v32, v16
+; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+ ret <32 x bfloat> %op
+}
+
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_maxnum_bf16:
@@ -10299,6 +16171,2432 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
ret <2 x bfloat> %op
}
+define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v2, v2, v5
+; GCN-NEXT: v_max_f32_e32 v1, v1, v4
+; GCN-NEXT: v_max_f32_e32 v0, v0, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_max_f32 v0, v0, v2
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v3, v3, v7
+; GCN-NEXT: v_max_f32_e32 v2, v2, v6
+; GCN-NEXT: v_max_f32_e32 v1, v1, v5
+; GCN-NEXT: v_max_f32_e32 v0, v0, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v5, v4
+; GFX10-NEXT: v_max_f32_e32 v4, v7, v6
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v4, v6, v5
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v7, v7, v15
+; GCN-NEXT: v_max_f32_e32 v6, v6, v14
+; GCN-NEXT: v_max_f32_e32 v5, v5, v13
+; GCN-NEXT: v_max_f32_e32 v4, v4, v12
+; GCN-NEXT: v_max_f32_e32 v3, v3, v11
+; GCN-NEXT: v_max_f32_e32 v2, v2, v10
+; GCN-NEXT: v_max_f32_e32 v1, v1, v9
+; GCN-NEXT: v_max_f32_e32 v0, v0, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v9, v10, v9
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v10, v11, v10
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT: v_max_f32_e32 v11, v12, v11
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v11, v11
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v10
+; GFX10-NEXT: v_max_f32_e32 v10, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v11, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v12, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v13, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v10, v11, v10
+; GFX10-NEXT: v_max_f32_e32 v11, v13, v12
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v9, v8
+; GFX11-NEXT: v_max_f32_e32 v9, v11, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_max_f32_e32 v10, v12, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v10, v11, v10
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v11, v13, v12
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v3, v3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
+; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_max_f32_e32 v14, v14, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_max_f32_e32 v13, v13, v29
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_max_f32_e32 v12, v12, v28
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_max_f32_e32 v11, v11, v27
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_max_f32_e32 v10, v10, v26
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_max_f32_e32 v9, v9, v25
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_max_f32_e32 v8, v8, v24
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_max_f32_e32 v7, v7, v23
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_max_f32_e32 v6, v6, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_max_f32_e32 v5, v5, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_max_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v3, v3, v19
+; GCN-NEXT: v_max_f32_e32 v2, v2, v18
+; GCN-NEXT: v_max_f32_e32 v1, v1, v17
+; GCN-NEXT: v_max_f32_e32 v0, v0, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_max_f32_e32 v15, v15, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_max_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_max_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_max_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_max_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v9, v17, v9
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v17, v18, v17
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v18, v19, v18
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v19, v20, v19
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v20, v21, v20
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v21, v22, v21
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v22, v23, v22
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v23, v24, v23
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v19, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v21, v22, v22
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_max_f32_e32 v19, v21, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v21, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v22, v24, v24
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v20, v21, v20
+; GFX10-NEXT: v_max_f32_e32 v21, v23, v22
+; GFX10-NEXT: v_max_f32_e32 v22, v25, v24
+; GFX10-NEXT: v_max_f32_e32 v23, v27, v26
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
+; GFX11-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
+; GFX11-NEXT: v_dual_max_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
+; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_max_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
+; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
+; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_dual_max_f32 v21, v23, v22 :: v_dual_max_f32 v22, v25, v24
+; GFX11-NEXT: v_dual_max_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v5, v5, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v3, v3, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_max_f32 v1, v1, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_max_f32_e32 v31, v32, v31
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_max_f32_e32 v30, v30, v32
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_max_f32_e32 v29, v29, v32
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_max_f32_e32 v28, v28, v32
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_max_f32_e32 v27, v27, v32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_max_f32_e32 v26, v26, v32
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_max_f32_e32 v25, v25, v32
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_max_f32_e32 v24, v24, v32
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_max_f32_e32 v23, v23, v32
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_max_f32_e32 v22, v22, v32
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_max_f32_e32 v21, v21, v32
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_max_f32_e32 v20, v20, v32
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_max_f32_e32 v19, v19, v32
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_max_f32_e32 v18, v18, v32
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_max_f32_e32 v17, v17, v32
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_max_f32_e32 v16, v16, v32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_max_f32_e32 v15, v15, v32
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_max_f32_e32 v14, v14, v32
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_max_f32_e32 v13, v13, v32
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_max_f32_e32 v12, v12, v32
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_max_f32_e32 v11, v11, v32
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_max_f32_e32 v10, v10, v32
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_max_f32_e32 v9, v9, v32
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_max_f32_e32 v8, v8, v32
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_max_f32_e32 v7, v7, v32
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_max_f32_e32 v6, v6, v32
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_max_f32_e32 v5, v5, v32
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_max_f32_e32 v4, v4, v32
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_max_f32_e32 v3, v3, v32
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_max_f32_e32 v2, v2, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_max_f32_e32 v1, v1, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_max_f32_e32 v0, v0, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX8-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_max_f32_e32 v30, v32, v30
+; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_max_f32_e32 v29, v32, v29
+; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_max_f32_e32 v28, v32, v28
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_max_f32_e32 v27, v27, v33
+; GFX8-NEXT: v_max_f32_e32 v15, v15, v32
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_max_f32_e32 v32, v33, v32
+; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_max_f32_e32 v26, v33, v26
+; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_max_f32_e32 v25, v33, v25
+; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_max_f32_e32 v24, v33, v24
+; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_max_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v22, v33, v22
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_max_f32_e32 v21, v33, v21
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_max_f32_e32 v20, v33, v20
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_max_f32_e32 v19, v33, v19
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v18, v33, v18
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v17, v33, v17
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4
+; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT: v_max_f32_e32 v31, v31, v31
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX9-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX9-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX9-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX9-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX9-NEXT: v_max_f32_e32 v32, v34, v33
+; GFX9-NEXT: v_max_f32_e32 v33, v37, v36
+; GFX9-NEXT: v_max_f32_e32 v37, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX9-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v50, v43, v43
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_max_f32_e32 v34, v39, v38
+; GFX9-NEXT: v_max_f32_e32 v38, v53, v52
+; GFX9-NEXT: v_max_f32_e32 v50, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT: v_max_f32_e32 v51, v52, v51
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v39, v55, v54
+; GFX9-NEXT: v_max_f32_e32 v52, v53, v52
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT: v_max_f32_e32 v53, v54, v53
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15
+; GFX9-NEXT: v_max_f32_e32 v36, v49, v48
+; GFX9-NEXT: v_max_f32_e32 v48, v41, v40
+; GFX9-NEXT: v_max_f32_e32 v54, v55, v54
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v42, v42, v42
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_max_f32_e32 v55, v40, v55
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_max_f32_e32 v49, v42, v49
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v35
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v34, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v67, v67, v67
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT: v_max_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT: v_max_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT: v_max_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_max_f32_e32 v67, v68, v67
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
+; GFX10-NEXT: v_max_f32_e32 v32, v34, v32
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_max_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_max_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_max_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_max_f32_e32 v50, v68, v66
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_max_f32_e32 v16, v33, v16
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v17
+; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
+; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
+; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
+; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
+; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_dual_max_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
+; GFX11-NEXT: v_dual_max_f32 v34, v36, v35 :: v_dual_max_f32 v35, v38, v37
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
+; GFX11-NEXT: v_dual_max_f32 v36, v48, v39 :: v_dual_max_f32 v37, v50, v49
+; GFX11-NEXT: v_max_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
+; GFX11-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
+; GFX11-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX11-NEXT: v_max_f32_e32 v80, v80, v80
+; GFX11-NEXT: v_max_f32_e32 v82, v82, v82
+; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
+; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
+; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: v_dual_max_f32 v38, v52, v51 :: v_dual_max_f32 v53, v82, v81
+; GFX11-NEXT: v_dual_max_f32 v48, v64, v55 :: v_dual_max_f32 v55, v86, v85
+; GFX11-NEXT: v_dual_max_f32 v49, v66, v65 :: v_dual_max_f32 v50, v68, v67
+; GFX11-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_dual_max_f32 v51, v70, v69 :: v_dual_max_f32 v52, v80, v71
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_max_f32 v54, v84, v83
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v14, v14, v30
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_max_f32 v12, v12, v28
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_max_f32 v8, v8, v24
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_max_f32 v4, v4, v20
+; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_max_f32 v2, v2, v18
+; GFX11-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v15, v15, v17
+; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v16, v32, v16
+; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+ ret <32 x bfloat> %op
+}
+
declare bfloat @llvm.sqrt.bf16(bfloat)
define bfloat @v_sqrt_bf16(bfloat %a) {
@@ -18659,6 +26957,4766 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
ret i32 %readlane
}
+define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_select_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_select_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v0, v7, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v3
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
+ ret <4 x bfloat> %op
+}
+
+define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) {
+; GCN-LABEL: v_select_v6bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_v6bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v6bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v6bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v11, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v6bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v6bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v2 :: v_dual_cndmask_b32 v5, v10, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v7 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
+ ret <6 x bfloat> %op
+}
+
+define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_select_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v15, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v13, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v11, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v13, vcc
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v15, vcc
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v13, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v15, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-NEXT: v_dual_cndmask_b32 v13, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v10, v9 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v11 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_and_b32 v12, 0xffff, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v5
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v2, v12, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_select_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v30
+; GCN-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; GCN-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v28
+; GCN-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27
+; GCN-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v26
+; GCN-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25
+; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v24
+; GCN-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v20
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v19
+; GCN-NEXT: v_cndmask_b32_e32 v3, v18, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v20
+; GCN-NEXT: v_cndmask_b32_e32 v16, v14, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v30
+; GFX7-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; GFX7-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v28
+; GFX7-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v27
+; GFX7-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v26
+; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v25
+; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v24
+; GFX7-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v22
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v16, v0, v16, vcc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v18, v15, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v16
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v8
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v0, v20, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v19, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v11
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v12
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v14
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v16
+; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v20
+; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v8
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v0, v20, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v19, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v17, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v14
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v16
+; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v20
+; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v7
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v0, v27, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v21, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v18, v17, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v20, v19, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v29, v28, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v31, v30, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v26, v25, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v24, v23, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v28
+; GFX10-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v6, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v7, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v16
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v16, v0, v27 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v18, v17, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v3, v11, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v10, v20, v19 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v22, v21, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v28, v29, v28 :: v_dual_cndmask_b32 v15, v31, v30
+; GFX11-NEXT: v_dual_cndmask_b32 v14, v26, v25 :: v_dual_and_b32 v3, 0xffff, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_dual_cndmask_b32 v12, v24, v23 :: v_dual_lshlrev_b32 v11, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v11
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v16
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v28
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v5, v9
+; GFX11-NEXT: v_or_b32_e32 v5, v6, v10
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v11
+; GFX11-NEXT: v_or_b32_e32 v7, v8, v12
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_select_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cndmask_b32_e32 v31, v0, v31, vcc
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v33
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_cndmask_b32_e32 v32, v32, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v30, v30, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:112
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_cndmask_b32_e32 v27, v27, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v26, v26, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_cndmask_b32_e32 v25, v25, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v24, v24, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_cndmask_b32_e32 v23, v23, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v22, v22, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_cndmask_b32_e32 v21, v21, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v20, v20, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_cndmask_b32_e32 v19, v19, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v18, v18, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_cndmask_b32_e32 v17, v17, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GCN-NEXT: v_cndmask_b32_e32 v33, v33, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v10
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_select_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_cndmask_b32_e32 v31, v31, v0, vcc
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v32, v32, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: v_cndmask_b32_e32 v30, v30, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_cndmask_b32_e32 v29, v29, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: v_cndmask_b32_e32 v28, v28, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_cndmask_b32_e32 v27, v27, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: v_cndmask_b32_e32 v26, v26, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_cndmask_b32_e32 v25, v25, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX7-NEXT: v_cndmask_b32_e32 v24, v24, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_cndmask_b32_e32 v23, v23, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: v_cndmask_b32_e32 v22, v22, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_cndmask_b32_e32 v21, v21, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v20, v20, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_cndmask_b32_e32 v19, v19, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v15, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_cndmask_b32_e32 v14, v14, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v16
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v18
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v32
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_select_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v30, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v29, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v28, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v27, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v26, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v25, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v0, vcc
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v0, v16, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v0, v33, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v32, v0, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v19
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v20
+; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v22
+; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v23
+; GFX8-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v24
+; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v25
+; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v26
+; GFX8-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v27
+; GFX8-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v28
+; GFX8-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v29
+; GFX8-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v30
+; GFX8-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31
+; GFX8-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32
+; GFX8-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v33
+; GFX8-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_select_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX9-NEXT: v_cndmask_b32_e32 v30, v30, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v29, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v28, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v27, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v26, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v25, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v24, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v23, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v22, v0, vcc
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v0, v16, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v33, v0, v33, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v15, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX9-NEXT: v_cndmask_b32_e32 v32, v32, v0, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v21, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v20, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v19, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v19
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v20
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v22
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v23
+; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v24
+; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v25
+; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v26
+; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v27
+; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v28
+; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v29
+; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v30
+; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v31
+; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v32
+; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v33
+; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_select_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v67, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v67, v68, v67, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v65, v64, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v68, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v68, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v34, v33, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v35, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v65, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v55, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v54, v53, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v52, v51, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v50, v49, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v48, v39, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v38, v37, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v28, v65, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v68, v66, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v64
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v3, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v67
+; GFX10-NEXT: v_or_b32_sdwa v5, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v6, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v7, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v8, v9, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v9, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v10, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v11, v12, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v12, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v13, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v31
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v32
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v65, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v27, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_or_b32_sdwa v14, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v15, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v16
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v30, v82, v81 :: v_dual_cndmask_b32 v11, v27, v11
+; GFX11-NEXT: v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v7, v23, v7
+; GFX11-NEXT: v_dual_cndmask_b32 v28, v70, v69 :: v_dual_cndmask_b32 v27, v68, v67
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v26, v66, v65 :: v_dual_cndmask_b32 v23, v52, v51
+; GFX11-NEXT: v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v5, v21, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v24, v54, v53 :: v_dual_cndmask_b32 v21, v48, v39
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v1, v17, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v22, v50, v49 :: v_dual_cndmask_b32 v3, v19, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v17, v34, v33
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v38, v37, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v18, v36, v35 :: v_dual_lshlrev_b32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_dual_cndmask_b32 v13, v29, v13 :: v_dual_and_b32 v12, 0xffff, v12
+; GFX11-NEXT: v_dual_cndmask_b32 v29, v80, v71 :: v_dual_and_b32 v14, 0xffff, v14
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v25, v64, v55 :: v_dual_and_b32 v10, 0xffff, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v30
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 16, v31
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GFX11-NEXT: v_dual_cndmask_b32 v16, v31, v16 :: v_dual_cndmask_b32 v15, v32, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v31, v87, v86 :: v_dual_cndmask_b32 v84, v85, v84
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v83, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v84
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v17
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v18
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v19
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v20
+; GFX11-NEXT: v_or_b32_e32 v4, v5, v21
+; GFX11-NEXT: v_or_b32_e32 v5, v6, v22
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v23
+; GFX11-NEXT: v_or_b32_e32 v7, v8, v24
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v25
+; GFX11-NEXT: v_or_b32_e32 v9, v10, v26
+; GFX11-NEXT: v_or_b32_e32 v10, v11, v27
+; GFX11-NEXT: v_or_b32_e32 v11, v12, v28
+; GFX11-NEXT: v_or_b32_e32 v12, v13, v29
+; GFX11-NEXT: v_or_b32_e32 v13, v14, v30
+; GFX11-NEXT: v_or_b32_e32 v14, v15, v31
+; GFX11-NEXT: v_or_b32_e32 v15, v16, v32
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
+ ret <32 x bfloat> %op
+}
+
+define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
+; GCN-LABEL: s_select_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NEXT: s_lshr_b32 s5, s5, 16
+; GCN-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NEXT: s_lshr_b32 s3, s3, 16
+; GCN-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v2
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_select_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
+; GFX7-NEXT: s_lshr_b32 s3, s3, 16
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: s_lshr_b32 s4, s4, 16
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT: s_lshr_b32 s2, s2, 16
+; GFX7-NEXT: s_lshr_b32 s5, s5, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_select_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_lshr_b32 s5, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_select_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_select_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s4, s0, 16
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: s_lshr_b32 s4, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, s4, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, s2, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_select_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_lshr_b32 s5, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, s5, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v1, s2, v2 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %cond = icmp eq i32 %c, 0
+ %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
+ %cast = bitcast <3 x bfloat> %op to i48
+ %elt0 = trunc i48 %cast to i32
+ %elt1.hi = lshr i48 %cast, 32
+ %elt1 = trunc i48 %elt1.hi to i32
+ %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+ %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+ %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+ %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+ ret <2 x i32> %bv.1
+}
+
+define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
+; GCN-LABEL: s_select_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NEXT: s_lshr_b32 s5, s5, 16
+; GCN-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NEXT: s_lshr_b32 s3, s3, 16
+; GCN-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NEXT: s_lshr_b32 s6, s6, 16
+; GCN-NEXT: s_lshr_b32 s7, s7, 16
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: v_mov_b32_e32 v6, s1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: v_readfirstlane_b32 s0, v1
+; GCN-NEXT: v_readfirstlane_b32 s1, v0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_select_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshr_b32 s2, s2, 16
+; GFX7-NEXT: s_lshr_b32 s6, s6, 16
+; GFX7-NEXT: s_lshr_b32 s3, s3, 16
+; GFX7-NEXT: s_lshr_b32 s7, s7, 16
+; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
+; GFX7-NEXT: s_lshr_b32 s4, s4, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: s_lshr_b32 s5, s5, 16
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT: v_readfirstlane_b32 s0, v1
+; GFX7-NEXT: v_readfirstlane_b32 s1, v0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_select_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s6, s1, 16
+; GFX8-NEXT: s_lshr_b32 s7, s3, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_lshr_b32 s5, s2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_select_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshr_b32 s6, s1, 16
+; GFX9-NEXT: s_lshr_b32 s7, s3, 16
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_select_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s4, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_lshr_b32 s6, s3, 16
+; GFX10-NEXT: s_lshr_b32 s4, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v3, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, s6, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, s2, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_readfirstlane_b32 s1, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v2
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_select_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v3, s3, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %cond = icmp eq i32 %c, 0
+ %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
+ %cast = bitcast <4 x bfloat> %op to <2 x i32>
+ %elt0 = extractelement <2 x i32> %cast, i32 0
+ %elt1 = extractelement <2 x i32> %cast, i32 1
+ %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+ %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+ %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+ %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+ ret <2 x i32> %bv.1
+}
+
+define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, <4 x i32> %c) {
+; GCN-LABEL: s_vselect_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v4, s7
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GCN-NEXT: v_mov_b32_e32 v4, s5
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: v_mov_b32_e32 v5, s0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v2
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_vselect_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, s5
+; GFX7-NEXT: v_mov_b32_e32 v4, s1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v2
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_vselect_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s6, s1, 16
+; GFX8-NEXT: s_lshr_b32 s7, s3, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_lshr_b32 s5, s2, 16
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v2
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_vselect_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshr_b32 s6, s1, 16
+; GFX9-NEXT: s_lshr_b32 s7, s3, 16
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_vselect_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s4, s1, 16
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_lshr_b32 s5, s0, 16
+; GFX10-NEXT: s_lshr_b32 s4, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: s_lshr_b32 s5, s3, 16
+; GFX10-NEXT: v_cndmask_b32_e32 v3, s5, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, s0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, s1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, s2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_vselect_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s4, s1, 16
+; GFX11-NEXT: s_lshr_b32 s5, s0, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX11-NEXT: s_lshr_b32 s6, s3, 16
+; GFX11-NEXT: s_lshr_b32 s4, s2, 16
+; GFX11-NEXT: v_cndmask_b32_e32 v3, s6, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v5, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v6, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %cond = icmp eq <4 x i32> %c, zeroinitializer
+ %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
+ %cast = bitcast <4 x bfloat> %op to <2 x i32>
+ %elt0 = extractelement <2 x i32> %cast, i32 0
+ %elt1 = extractelement <2 x i32> %cast, i32 1
+ %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+ %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+ %bv.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+ %bv.1 = insertelement <2 x i32> %bv.0, i32 %readlane1, i32 1
+ ret <2 x i32> %bv.1
+}
+
+define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_vselect_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_vselect_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v6, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v4, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_vselect_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v7
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_vselect_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v7
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_vselect_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v10, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_vselect_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_and_b32_e32 v5, 1, v5
+; GCN-NEXT: v_and_b32_e32 v6, 1, v6
+; GCN-NEXT: v_and_b32_e32 v7, 1, v7
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GCN-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_vselect_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_vselect_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v15
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v22, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_vselect_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v15
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v22, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_vselect_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v15
+; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v23, v22, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v23, v22 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v13, v9 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v8 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v20, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v3, v6, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_vselect_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_writelane_b32 v31, s30, 0
+; GCN-NEXT: v_writelane_b32 v31, s31, 1
+; GCN-NEXT: v_writelane_b32 v31, s34, 2
+; GCN-NEXT: v_writelane_b32 v31, s35, 3
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v1
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v2
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v3
+; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v4
+; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v5
+; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v6
+; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v7
+; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v8
+; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v9
+; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 1, v10
+; GCN-NEXT: v_and_b32_e32 v1, 1, v11
+; GCN-NEXT: v_and_b32_e32 v2, 1, v12
+; GCN-NEXT: v_and_b32_e32 v3, 1, v13
+; GCN-NEXT: v_and_b32_e32 v4, 1, v14
+; GCN-NEXT: v_and_b32_e32 v5, 1, v15
+; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32
+; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v15, v3, v2, s[34:35]
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v14, v4, v30, s[30:31]
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v13, v5, v29, s[28:29]
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v12, v2, v28, s[26:27]
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v11, v3, v27, s[24:25]
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v10, v4, v26, s[22:23]
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v9, v2, v25, s[20:21]
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v8, v3, v24, s[18:19]
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v7, v4, v23, s[16:17]
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v6, v2, v22, s[14:15]
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v5, v3, v21, s[12:13]
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[10:11]
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cndmask_b32_e64 v19, v2, v19, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v1, v18, s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v17, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_readlane_b32 s35, v31, 3
+; GCN-NEXT: v_readlane_b32 s34, v31, 2
+; GCN-NEXT: v_readlane_b32 s31, v31, 1
+; GCN-NEXT: v_readlane_b32 s30, v31, 0
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_vselect_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v12
+; GFX7-NEXT: v_writelane_b32 v31, s30, 0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX7-NEXT: v_writelane_b32 v31, s31, 1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX7-NEXT: v_writelane_b32 v31, s34, 2
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v15
+; GFX7-NEXT: v_writelane_b32 v31, s35, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[6:7]
+; GFX7-NEXT: s_waitcnt vmcnt(13)
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[8:9]
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[10:11]
+; GFX7-NEXT: s_waitcnt vmcnt(11)
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[14:15]
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[16:17]
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[18:19]
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[20:21]
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[22:23]
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[24:25]
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[26:27]
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[28:29]
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[30:31]
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v32, v15, s[34:35]
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_readlane_b32 s35, v31, 3
+; GFX7-NEXT: v_readlane_b32 s34, v31, 2
+; GFX7-NEXT: v_readlane_b32 s31, v31, 1
+; GFX7-NEXT: v_readlane_b32 s30, v31, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_vselect_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v32, v31, vcc
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX8-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX8-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v29, v21, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v30, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v29, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v30, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v23
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v17
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v24
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v31, v23, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v31
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v25
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v10, v28, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX8-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_vselect_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v31, vcc
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v29, v21, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v30, v11, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v29, v10, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v30, v9, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v23
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v17
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v24
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v31, v23, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v31
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v25
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v28, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX9-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_vselect_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v29
+; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v28
+; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v23
+; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v38, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v36, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v6, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v31
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v54, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
+; GFX10-NEXT: v_or_b32_sdwa v7, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v52, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v50, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v28
+; GFX11-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v48, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX11-NEXT: v_dual_cndmask_b32 v8, v28, v20 :: v_dual_and_b32 v15, 1, v15
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v39, v38 :: v_dual_and_b32 v8, 0xffff, v8
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v5, v37, v36 :: v_dual_and_b32 v6, 0xffff, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v34, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v32, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v31
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v54, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v3, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v5, v10, v11
+; GFX11-NEXT: v_or_b32_e32 v6, v12, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v7, v14, v15
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_vselect_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v31, 1, v30
+; GCN-NEXT: v_and_b32_e32 v29, 1, v29
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:252
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:248
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:244
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:240
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:232
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_and_b32_e32 v51, 1, v5
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v51
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v31
+; GCN-NEXT: s_waitcnt vmcnt(11)
+; GCN-NEXT: v_cndmask_b32_e64 v31, v34, v33, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: v_cndmask_b32_e64 v29, v36, v35, s[4:5]
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_and_b32_e32 v18, 1, v18
+; GCN-NEXT: v_and_b32_e32 v22, 1, v22
+; GCN-NEXT: v_and_b32_e32 v26, 1, v26
+; GCN-NEXT: v_and_b32_e32 v28, 1, v28
+; GCN-NEXT: v_and_b32_e32 v27, 1, v27
+; GCN-NEXT: v_and_b32_e32 v25, 1, v25
+; GCN-NEXT: v_and_b32_e32 v24, 1, v24
+; GCN-NEXT: v_and_b32_e32 v23, 1, v23
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
+; GCN-NEXT: s_waitcnt vmcnt(11)
+; GCN-NEXT: v_cndmask_b32_e64 v28, v38, v37, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cndmask_b32_e64 v27, v39, v51, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v26
+; GCN-NEXT: v_cndmask_b32_e64 v26, v36, v35, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25
+; GCN-NEXT: v_cndmask_b32_e64 v25, v49, v48, s[4:5]
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:228
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:224
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:212
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:208
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_cndmask_b32_e64 v24, v35, v50, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_cndmask_b32_e64 v23, v36, v34, s[4:5]
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:204
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_and_b32_e32 v21, 1, v21
+; GCN-NEXT: v_and_b32_e32 v20, 1, v20
+; GCN-NEXT: v_and_b32_e32 v19, 1, v19
+; GCN-NEXT: v_and_b32_e32 v17, 1, v17
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cndmask_b32_e64 v22, v33, v50, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21
+; GCN-NEXT: v_cndmask_b32_e64 v21, v38, v37, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
+; GCN-NEXT: v_cndmask_b32_e64 v20, v48, v39, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19
+; GCN-NEXT: v_cndmask_b32_e64 v19, v49, v36, s[4:5]
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:196
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:184
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
+; GCN-NEXT: v_cndmask_b32_e64 v18, v35, v34, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_cndmask_b32_e64 v17, v33, v51, s[4:5]
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_and_b32_e32 v14, 1, v14
+; GCN-NEXT: v_and_b32_e32 v16, 1, v16
+; GCN-NEXT: v_and_b32_e32 v15, 1, v15
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; GCN-NEXT: s_waitcnt vmcnt(9)
+; GCN-NEXT: v_cndmask_b32_e64 v16, v37, v36, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cndmask_b32_e64 v15, v38, v51, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
+; GCN-NEXT: v_cndmask_b32_e64 v14, v35, v34, s[4:5]
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:176
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:168
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:164
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:160
+; GCN-NEXT: v_and_b32_e32 v10, 1, v10
+; GCN-NEXT: v_and_b32_e32 v13, 1, v13
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v13
+; GCN-NEXT: v_cndmask_b32_e64 v13, v48, v39, s[4:5]
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172
+; GCN-NEXT: v_and_b32_e32 v12, 1, v12
+; GCN-NEXT: v_and_b32_e32 v11, 1, v11
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
+; GCN-NEXT: v_cndmask_b32_e64 v12, v50, v49, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_cndmask_b32_e64 v11, v34, v33, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cndmask_b32_e64 v10, v48, v39, s[4:5]
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_and_b32_e32 v6, 1, v6
+; GCN-NEXT: v_and_b32_e32 v9, 1, v9
+; GCN-NEXT: v_and_b32_e32 v8, 1, v8
+; GCN-NEXT: v_and_b32_e32 v7, 1, v7
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9
+; GCN-NEXT: v_cndmask_b32_e64 v9, v36, v35, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
+; GCN-NEXT: v_cndmask_b32_e64 v8, v38, v37, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cndmask_b32_e64 v7, v51, v50, s[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6
+; GCN-NEXT: v_cndmask_b32_e64 v6, v49, v48, s[4:5]
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:148
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:144
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32
+; GCN-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_cndmask_b32_e32 v4, v36, v35, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_cndmask_b32_e32 v3, v37, v39, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GCN-NEXT: v_cndmask_b32_e32 v2, v34, v33, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_cndmask_b32_e32 v1, v48, v38, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cndmask_b32_e32 v0, v50, v49, vcc
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 1, v51
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v33
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cndmask_b32_e32 v32, v32, v30, vcc
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v31
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_vselect_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX7-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX7-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX7-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX7-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX7-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX7-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v28
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v26
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v25
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v24
+; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v23
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:240
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29
+; GFX7-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX7-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX7-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX7-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_cndmask_b32_e64 v30, v27, v28, s[4:5]
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_cndmask_b32_e64 v28, v23, v24, s[6:7]
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_cndmask_b32_e32 v29, v25, v26, vcc
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_cndmask_b32_e64 v27, v22, v27, s[8:9]
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cndmask_b32_e64 v26, v23, v25, s[10:11]
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:232
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e64 v25, v23, v22, s[12:13]
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e64 v24, v23, v22, s[14:15]
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:224
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e64 v23, v22, v23, s[16:17]
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e64 v22, v31, v22, s[18:19]
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v19, v19, v32, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:196
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v15, v32, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:188
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:184
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:180
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v11, v11, v32, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:168
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v32, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:152
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:140
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_vselect_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 1, v23
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v22
+; GFX8-NEXT: buffer_load_ushort v22, off, s[0:3], s32
+; GFX8-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX8-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v28
+; GFX8-NEXT: v_and_b32_e32 v28, 1, v29
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v24
+; GFX8-NEXT: v_and_b32_e32 v24, 1, v25
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v28
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v24
+; GFX8-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v26
+; GFX8-NEXT: v_and_b32_e32 v26, 1, v27
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v26
+; GFX8-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX8-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX8-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX8-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX8-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX8-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX8-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX8-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX8-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
+; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
+; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[10:11]
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[12:13]
+; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56
+; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e64 v24, v25, v24, s[14:15]
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[16:17]
+; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52
+; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v26
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e64 v26, v27, v26, s[18:19]
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v28, s[20:21]
+; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48
+; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e64 v28, v29, v28, s[6:7]
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v30, v29, v30, s[8:9]
+; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v29
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[4:5]
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v34, v21, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v32, vcc
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v32, vcc
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v34, v17, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v33, v32, vcc
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v34, v15, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v33, v32, vcc
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v34, v13, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v33, v32, vcc
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v34, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v33, v32, vcc
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v34, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v32
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v32, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v34, v32, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v34, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v33
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v31
+; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v12, v26, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v13, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v14, v22, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_vselect_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v22
+; GFX9-NEXT: v_and_b32_e32 v22, 1, v23
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v22
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX9-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v28
+; GFX9-NEXT: v_and_b32_e32 v28, 1, v29
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v24
+; GFX9-NEXT: v_and_b32_e32 v24, 1, v25
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v28
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v24
+; GFX9-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v26
+; GFX9-NEXT: v_and_b32_e32 v26, 1, v27
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v26
+; GFX9-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX9-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX9-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX9-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX9-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX9-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX9-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX9-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[10:11]
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[12:13]
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v24, v25, v24, s[14:15]
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[16:17]
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v26
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v26, v27, v26, s[18:19]
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v28, s[20:21]
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e64 v28, v29, v28, s[6:7]
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_cndmask_b32_e64 v30, v29, v30, s[8:9]
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[4:5]
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v34, v21, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v32, vcc
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v32, vcc
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v34, v17, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v32, vcc
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v34, v15, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v32, vcc
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v34, v13, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v32, vcc
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v34, v11, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v32, vcc
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v34, v9, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v32
+; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v32, v7, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v32, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v34, v7, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v33
+; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX9-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v31
+; GFX9-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v12, v26, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v22, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_vselect_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX10-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX10-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX10-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT: s_clause 0x15
+; GFX10-NEXT: buffer_load_ushort v36, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124
+; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72
+; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
+; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76
+; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16
+; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80
+; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20
+; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:84
+; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:24
+; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:88
+; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:28
+; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92
+; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:32
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v20
+; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v24
+; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v30
+; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:40
+; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v0
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
+; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v2
+; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44
+; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v4
+; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108
+; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v6
+; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:48
+; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v8
+; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120
+; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v10
+; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116
+; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12
+; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112
+; GFX10-NEXT: v_writelane_b32 v31, s30, 0
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX10-NEXT: v_writelane_b32 v31, s31, 1
+; GFX10-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX10-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT: v_writelane_b32 v31, s34, 2
+; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX10-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX10-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX10-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX10-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
+; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
+; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v18
+; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v28
+; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v26
+; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1
+; GFX10-NEXT: v_writelane_b32 v31, s35, 3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v29
+; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
+; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
+; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
+; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21
+; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19
+; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17
+; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15
+; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
+; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
+; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
+; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9
+; GFX10-NEXT: s_waitcnt vmcnt(32)
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v36
+; GFX10-NEXT: s_waitcnt vmcnt(31)
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v37
+; GFX10-NEXT: s_waitcnt vmcnt(30)
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v38, v37, s6
+; GFX10-NEXT: s_waitcnt vmcnt(29)
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v39
+; GFX10-NEXT: s_waitcnt vmcnt(28)
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v48, v39, s7
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v48
+; GFX10-NEXT: s_waitcnt vmcnt(23)
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v49
+; GFX10-NEXT: s_waitcnt vmcnt(22)
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v50, v49, s8
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v50
+; GFX10-NEXT: s_waitcnt vmcnt(21)
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v51
+; GFX10-NEXT: s_waitcnt vmcnt(20)
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v52, v51, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v52
+; GFX10-NEXT: s_waitcnt vmcnt(19)
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v53
+; GFX10-NEXT: s_waitcnt vmcnt(18)
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v54, v53, s10
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54
+; GFX10-NEXT: s_waitcnt vmcnt(17)
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v55
+; GFX10-NEXT: s_waitcnt vmcnt(16)
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v64, v55, s11
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v64
+; GFX10-NEXT: s_waitcnt vmcnt(15)
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v65
+; GFX10-NEXT: s_waitcnt vmcnt(14)
+; GFX10-NEXT: v_cndmask_b32_e64 v36, v66, v65, s12
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v66
+; GFX10-NEXT: s_waitcnt vmcnt(13)
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v67
+; GFX10-NEXT: s_waitcnt vmcnt(12)
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v68
+; GFX10-NEXT: s_waitcnt vmcnt(11)
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v69
+; GFX10-NEXT: s_waitcnt vmcnt(10)
+; GFX10-NEXT: v_cndmask_b32_e64 v50, v20, v69, s14
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: s_waitcnt vmcnt(9)
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v22
+; GFX10-NEXT: s_waitcnt vmcnt(8)
+; GFX10-NEXT: v_cndmask_b32_e64 v22, v24, v22, s15
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v30
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_cndmask_b32_e64 v30, v0, v30, s16
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e64 v34, v35, v34, s17
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_cndmask_b32_e64 v55, v4, v2, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v32
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v33
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_cndmask_b32_e64 v33, v8, v33, s19
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cndmask_b32_e64 v32, v10, v32, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v66, v12, v6, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v12
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v35, v54, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v65, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v64, s21
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v4, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v53, s23
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v52, s24
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v24, v51, s25
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v20, v49, s26
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v48, v38, s27
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v37, v29, s28
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v25, v21, s29
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v7, s31
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v16, v13, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v19, v17, s34
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v28, v26, s35
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v39, v68, v67, s13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3
+; GFX10-NEXT: v_or_b32_sdwa v0, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v3, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v4, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v5, v36, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v6, v39, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v7, v50, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v8, v22, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v9, v30, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v10, v55, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v11, v66, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v12, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v13, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v14, v34, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v15, v15, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_readlane_b32 s35, v31, 3
+; GFX10-NEXT: v_readlane_b32 s34, v31, 2
+; GFX10-NEXT: v_readlane_b32 s31, v31, 1
+; GFX10-NEXT: v_readlane_b32 s30, v31, 0
+; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x20
+; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:88
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:100
+; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:104
+; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:120
+; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v87, off, s32
+; GFX11-NEXT: s_waitcnt vmcnt(32)
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v31
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v32
+; GFX11-NEXT: s_waitcnt vmcnt(30)
+; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v33
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v34
+; GFX11-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX11-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX11-NEXT: s_waitcnt vmcnt(28)
+; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v35
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v36
+; GFX11-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt vmcnt(26)
+; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v37
+; GFX11-NEXT: s_waitcnt vmcnt(25)
+; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v38
+; GFX11-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX11-NEXT: s_waitcnt vmcnt(24)
+; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v39
+; GFX11-NEXT: s_waitcnt vmcnt(23)
+; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v48
+; GFX11-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX11-NEXT: s_waitcnt vmcnt(22)
+; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v49
+; GFX11-NEXT: s_waitcnt vmcnt(21)
+; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v50
+; GFX11-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX11-NEXT: s_waitcnt vmcnt(20)
+; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v51
+; GFX11-NEXT: s_waitcnt vmcnt(19)
+; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v52
+; GFX11-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX11-NEXT: s_waitcnt vmcnt(18)
+; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v53
+; GFX11-NEXT: s_waitcnt vmcnt(17)
+; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v54
+; GFX11-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX11-NEXT: s_waitcnt vmcnt(16)
+; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v55
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v64
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v83
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_cndmask_b32_e32 v83, v84, v83, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v85
+; GFX11-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v65
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cndmask_b32_e32 v85, v86, v85, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX11-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v66
+; GFX11-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v67
+; GFX11-NEXT: v_cndmask_b32_e32 v26, v82, v81, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v68
+; GFX11-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v69
+; GFX11-NEXT: v_cndmask_b32_e32 v24, v80, v71, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX11-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v70
+; GFX11-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v71
+; GFX11-NEXT: v_cndmask_b32_e32 v22, v70, v69, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX11-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v80
+; GFX11-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v81
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v68, v67, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
+; GFX11-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v82
+; GFX11-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v66, v65, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v87, 1, v87
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v64, v55, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v53, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v51, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v39, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v37, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v35, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v34, v33, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v31, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v87
+; GFX11-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v85
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v30, v86, v30, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v83
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_cndmask_b32_e32 v28, v84, v28, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
+; GFX11-NEXT: v_dual_cndmask_b32 v27, v147, v146 :: v_dual_lshlrev_b32 v28, 16, v28
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_cndmask_b32_e32 v25, v145, v144, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_cndmask_b32_e32 v23, v135, v134, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v133, v132, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_cndmask_b32_e32 v19, v131, v130, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v129, v128, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v119, v118, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v117, v116, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v115, v114, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v113, v112, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v103, v102, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v101, v100, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v99, v98, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v97, v96, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v3, v6, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v5, v10, v11
+; GFX11-NEXT: v_or_b32_e32 v6, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v7, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v8, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v9, v18, v19
+; GFX11-NEXT: v_or_b32_e32 v10, v20, v21
+; GFX11-NEXT: v_or_b32_e32 v11, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v12, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v13, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v14, v29, v28
+; GFX11-NEXT: v_or_b32_e32 v15, v31, v30
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
+ ret <32 x bfloat> %op
+}
+
declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir
index 1aafcdcbdfa0..253e7e278aaf 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir
@@ -68,7 +68,7 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000)
; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VS_32 */, def renamable $sgpr4
+ ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4
; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
index 15879eb1e2fb..e4d9fbfb1705 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir
@@ -68,7 +68,7 @@ body: |
; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000)
; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VS_32 */, def renamable $sgpr4
+ ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4
; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
index ac196635b363..5c1a70937204 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -4,19 +4,20 @@
define amdgpu_cs <2 x i32> @f() {
; CHECK-LABEL: f:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_mov_b32 s1, s0
-; CHECK-NEXT: s_mov_b32 s2, s0
-; CHECK-NEXT: s_mov_b32 s3, s0
-; CHECK-NEXT: s_mov_b32 s4, s0
-; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; CHECK-NEXT: s_mov_b32 s5, s0
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s6, s4
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: s_mov_b32 s0, s4
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; CHECK-NEXT: s_mov_b32 s1, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; CHECK-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[0:1], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s4
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CHECK-NEXT: ; return to shader part epilog
bb:
%i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
index 45c8a98f3dcb..968cf21bd6d5 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir
@@ -20,7 +20,7 @@ body: |
; CHECK-LABEL: name: foo1
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef early-clobber %2.sub1
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32
@@ -41,7 +41,7 @@ body: |
; CHECK-LABEL: name: foo2
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef %2.sub0
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32
@@ -62,7 +62,7 @@ body: |
; CHECK-LABEL: name: foo3
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef early-clobber %2.sub1
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32
@@ -83,7 +83,7 @@ body: |
; CHECK-LABEL: name: foo4
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_16_and_SReg_1_with_lo16_in_TTMP_LO16 */, def undef %2.sub0
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0
; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
; CHECK-NEXT: S_ENDPGM 0
INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 213b6e6e620d..3d69655111da 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -607,13 +607,10 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_flbit_i32_b32 s4, s4
-; SI-NEXT: s_flbit_i32_b32 s5, s5
-; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: s_add_i32 s4, s4, 32
-; SI-NEXT: v_min3_u32 v0, s4, v0, 64
+; SI-NEXT: s_flbit_i32_b64 s4, s[4:5]
+; SI-NEXT: s_min_u32 s4, s4, 64
; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -625,10 +622,9 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s4, s4
-; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp
-; VI-NEXT: s_flbit_i32_b32 s4, s5
-; VI-NEXT: v_min3_u32 v0, v0, s4, 64
+; VI-NEXT: s_flbit_i32_b64 s4, s[4:5]
+; VI-NEXT: s_min_u32 s4, s4, 64
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
@@ -657,10 +653,9 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_flbit_i32_b32 s0, s2
-; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp
-; GFX10-NEXT: s_flbit_i32_b32 s0, s3
-; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64
+; GFX10-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; GFX10-NEXT: s_min_u32 s0, s0, 64
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
@@ -684,14 +679,11 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp
-; GFX11-NEXT: s_clz_i32_u32 s2, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64
+; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s2, s2, 64
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -706,16 +698,13 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_flbit_i32_b64 s2, s[2:3]
+; SI-NEXT: s_min_u32 s2, s2, 64
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_flbit_i32_b32 s0, s2
-; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf
-; SI-NEXT: s_flbit_i32_b32 s1, s3
-; SI-NEXT: s_add_i32 s0, s0, 32
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_min3_u32 v0, s0, v0, 64
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -726,11 +715,10 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_flbit_i32_b32 s0, s2
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp
-; VI-NEXT: s_flbit_i32_b32 s0, s3
-; VI-NEXT: v_min3_u32 v0, v0, s0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -754,13 +742,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
; GFX10-LABEL: s_ctlz_i64_trunc:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_flbit_i32_b32 s2, s2
-; GFX10-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp
-; GFX10-NEXT: s_flbit_i32_b32 s2, s3
-; GFX10-NEXT: v_min3_u32 v0, v0, s2, 64
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3]
+; GFX10-NEXT: s_min_u32 s2, s2, 64
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
@@ -777,15 +764,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
; GFX11-LABEL: s_ctlz_i64_trunc:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp
-; GFX11-NEXT: s_clz_i32_u32 s2, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s2, s2, 64
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 354f5b954659..03f3d04cf8a6 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -6,6 +6,8 @@
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
+declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
+
declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
@@ -305,6 +307,787 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
ret void
}
+define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind {
+; SI-LABEL: s_ctlz_zero_undef_i8_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_and_b32 s2, s2, 0xff
+; SI-NEXT: s_flbit_i32_b32 s2, s2
+; SI-NEXT: s_sub_i32 s4, s2, 24
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s2, s2, 0xff
+; VI-NEXT: s_flbit_i32_b32 s2, s2
+; VI-NEXT: s_add_i32 s2, s2, -16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_add_u16_e64 v2, s2, -8
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_byte v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: s_ctlz_zero_undef_i8_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: -24(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: s_endpgm
+ %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i8 %val, 0
+ %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
+ store i8 %ctlz, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind {
+; SI-LABEL: s_ctlz_zero_undef_i16_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_flbit_i32_b32 s2, s2
+; SI-NEXT: s_add_i32 s4, s2, -16
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s2, s2
+; VI-NEXT: s_add_i32 s2, s2, -16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: s_ctlz_zero_undef_i16_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: -16(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: s_endpgm
+ %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i16 %val, 0
+ %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
+ store i16 %ctlz, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind {
+; SI-LABEL: s_ctlz_zero_undef_i32_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_flbit_i32_b32 s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_flbit_i32_b32 s2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: s_ctlz_zero_undef_i32_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z,
+;
+; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: s_endpgm
+ %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i32 %val, 0
+ %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
+ store i32 %ctlz, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind {
+; SI-LABEL: s_ctlz_zero_undef_i64_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_flbit_i32_b64 s2, s[2:3]
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W,
+; EG-NEXT: FFBH_UINT T1.W, KC0[3].X,
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_mov_b32 s5, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3]
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i64 %val, 0
+ %ret = select i1 %ctlz_ret, i64 %ctlz, i64 32
+ store i64 %ctlz, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; SI-LABEL: v_ctlz_zero_undef_i8_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_ffbh_u32_e32 v1, v0
+; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ctlz_zero_undef_i8_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT: v_add_u16_e32 v1, -8, v1
+; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_byte v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: v_ctlz_zero_undef_i8_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -24(nan), 3(4.203895e-45)
+; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %val = load i8, ptr addrspace(1) %arrayidx, align 1
+ %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i8 %val, 0
+ %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
+ store i8 %ret, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; SI-LABEL: v_ctlz_zero_undef_i16_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
+; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_ffbh_u32_e32 v1, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ctlz_zero_undef_i16_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s4, s2, 1
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: v_ctlz_zero_undef_i16_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -16(nan), 3(4.203895e-45)
+; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %val = load i16, ptr addrspace(1) %arrayidx, align 1
+ %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i16 %val, 0
+ %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
+ store i16 %ret, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; SI-LABEL: v_ctlz_zero_undef_i32_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
+; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
+; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
+; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v1, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ctlz_zero_undef_i32_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_u32 s2, s2, 1
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v4, v[6:7]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v2, 32, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: v_ctlz_zero_undef_i32_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
+; EG-NEXT: FFBH_UINT * T1.W, PV.W,
+; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %val = load i32, ptr addrspace(1) %arrayidx, align 1
+ %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i32 %val, 0
+ %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
+ store i32 %ret, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; SI-LABEL: v_ctlz_zero_undef_i64_with_select:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5
+; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7
+; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
+; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1
+; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2
+; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3
+; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4
+; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v0, v0, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, v1, v7
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v3, v5, v4
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_or_b32_e32 v1, v3, v2
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_min_u32_e32 v0, v1, v0
+; SI-NEXT: v_min_u32_e32 v0, 64, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ctlz_zero_undef_i64_with_select:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s4, s2, 5
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_u32 s4, s2, 4
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: s_add_u32 s4, s2, 7
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s4, s2, 6
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v7, s5
+; VI-NEXT: v_mov_b32_e32 v6, s4
+; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v9, s5
+; VI-NEXT: v_mov_b32_e32 v8, s4
+; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v11, s5
+; VI-NEXT: v_mov_b32_e32 v10, s4
+; VI-NEXT: s_add_u32 s4, s2, 1
+; VI-NEXT: flat_load_ubyte v12, v[0:1]
+; VI-NEXT: flat_load_ubyte v13, v[2:3]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: flat_load_ubyte v6, v[8:9]
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: flat_load_ubyte v7, v[10:11]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_or_b32_e32 v3, v3, v13
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v4, v3
+; VI-NEXT: v_ffbh_u32_e32 v3, v3
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: v_or_b32_e32 v0, v4, v0
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; VI-NEXT: v_min_u32_e32 v0, v0, v3
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_min_u32_e32 v0, 64, v0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: v_ctlz_zero_undef_i64_with_select:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 4, #1
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 6, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
+; EG-NEXT: FFBH_UINT T1.W, PV.W,
+; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
+; EG-NEXT: OR_INT * T1.W, PS, T2.X,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T2.W, PS,
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W,
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3]
+; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
+; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
+; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5
+; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5
+; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v4, v3
+; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
+; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
+; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+ %val = load i64, ptr addrspace(1) %arrayidx, align 1
+ %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
+ %ctlz_ret = icmp ne i64 %val, 0
+ %ret = select i1 %ctlz_ret, i64 %ctlz, i64 64
+ store i64 %ret, ptr addrspace(1) %out, align 4
+ ret void
+}
+
define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i8:
; SI: ; %bb.0:
@@ -403,10 +1186,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_flbit_i32_b32 s4, s4
-; SI-NEXT: s_flbit_i32_b32 s5, s5
-; SI-NEXT: s_add_i32 s4, s4, 32
-; SI-NEXT: s_min_u32 s4, s4, s5
+; SI-NEXT: s_flbit_i32_b64 s4, s[4:5]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -418,10 +1198,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_add_i32 s2, s2, 32
-; VI-NEXT: s_min_u32 s2, s2, s3
+; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -467,10 +1244,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_flbit_i32_b32 s2, s2
-; SI-NEXT: s_flbit_i32_b32 s3, s3
-; SI-NEXT: s_add_i32 s2, s2, 32
-; SI-NEXT: s_min_u32 s2, s2, s3
+; SI-NEXT: s_flbit_i32_b64 s2, s[2:3]
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -482,10 +1256,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_add_i32 s2, s2, 32
-; VI-NEXT: s_min_u32 s2, s2, s3
+; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index e871b80cbe29..db91554b2ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -510,13 +510,10 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_ff1_i32_b32 s5, s5
-; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf
-; SI-NEXT: s_add_i32 s5, s5, 32
-; SI-NEXT: s_ff1_i32_b32 s4, s4
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_min3_u32 v0, s4, v0, 64
+; SI-NEXT: s_ff1_i32_b64 s4, s[4:5]
+; SI-NEXT: s_min_u32 s4, s4, 64
; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -528,10 +525,9 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s5, s5
-; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp
-; VI-NEXT: s_ff1_i32_b32 s4, s4
-; VI-NEXT: v_min3_u32 v0, s4, v0, 64
+; VI-NEXT: s_ff1_i32_b64 s4, s[4:5]
+; VI-NEXT: s_min_u32 s4, s4, 64
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
@@ -560,10 +556,9 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ff1_i32_b32 s0, s3
-; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp
-; GFX10-NEXT: s_ff1_i32_b32 s0, s2
-; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64
+; GFX10-NEXT: s_ff1_i32_b64 s0, s[2:3]
+; GFX10-NEXT: s_min_u32 s0, s0, 64
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
@@ -591,16 +586,13 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_ff1_i32_b64 s2, s[2:3]
+; SI-NEXT: s_min_u32 s2, s2, 64
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_ff1_i32_b32 s0, s3
-; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf
-; SI-NEXT: s_add_i32 s0, s0, 32
-; SI-NEXT: s_ff1_i32_b32 s1, s2
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_min3_u32 v0, s1, v0, 64
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -611,11 +603,10 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_ff1_i32_b32 s0, s3
+; VI-NEXT: s_ff1_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp
-; VI-NEXT: s_ff1_i32_b32 s0, s2
-; VI-NEXT: v_min3_u32 v0, s0, v0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -639,13 +630,12 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %
; GFX10-LABEL: s_cttz_i64_trunc:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ff1_i32_b32 s3, s3
-; GFX10-NEXT: s_ff1_i32_b32 s2, s2
-; GFX10-NEXT: v_add_nc_u32_e64 v0, s3, 32 clamp
-; GFX10-NEXT: v_min3_u32 v0, s2, v0, 64
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3]
+; GFX10-NEXT: s_min_u32 s2, s2, 64
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 7e8c28fa4475..5985a235680c 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -500,10 +500,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_ff1_i32_b32 s3, s3
-; SI-NEXT: s_ff1_i32_b32 s2, s2
-; SI-NEXT: s_add_i32 s3, s3, 32
-; SI-NEXT: s_min_u32 s2, s2, s3
+; SI-NEXT: s_ff1_i32_b64 s2, s[2:3]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
@@ -516,10 +513,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s3, s3
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: s_add_i32 s3, s3, 32
-; VI-NEXT: s_min_u32 s2, s2, s3
+; VI-NEXT: s_ff1_i32_b64 s2, s[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -878,39 +872,41 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
-; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
-; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
-; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3
-; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:4
-; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:5
-; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:6
-; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:7
+; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5
+; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7
+; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
+; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1
+; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2
+; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3
+; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4
+; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_or_b32_e32 v0, v0, v6
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_or_b32_e32 v1, v3, v2
-; SI-NEXT: v_or_b32_e32 v2, v5, v4
-; SI-NEXT: v_or_b32_e32 v3, v7, v6
+; SI-NEXT: v_or_b32_e32 v1, v1, v7
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v3, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_ffbl_b32_e32 v1, v1
; SI-NEXT: v_ffbl_b32_e32 v0, v0
-; SI-NEXT: v_min_u32_e32 v1, 0xffffffdf, v1
-; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_min3_u32 v0, v0, v1, 64
+; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; SI-NEXT: v_min_u32_e32 v0, v0, v1
+; SI-NEXT: v_min_u32_e32 v0, 64, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -970,7 +966,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
; VI-NEXT: v_ffbl_b32_e32 v3, v3
-; VI-NEXT: v_add_u32_e64 v3, s[2:3], v3, 32 clamp
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -979,8 +975,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min3_u32 v0, v0, v3, 64
+; VI-NEXT: v_min_u32_e32 v0, v3, v0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_min_u32_e32 v0, 64, v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 444d6122eb73..3450700cadb0 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -100,7 +100,9 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -257,13 +259,14 @@ define amdgpu_kernel void @fadd_f16_imm_a(
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-GISEL-NEXT: v_mov_b16_e32 v1.l, 0x3c00
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -400,13 +403,14 @@ define amdgpu_kernel void @fadd_f16_imm_b(
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-GISEL-NEXT: v_mov_b16_e32 v1.l, 0x4000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 794b10eea58b..0cd409f726af 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1517,7 +1517,8 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CODEGEN-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0
; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1558,7 +1559,8 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; IR-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0
; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 97216b6c9469..b516660f3bdc 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -230,15 +230,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_movk_i32 s5, 0x7e00
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12
; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0
; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6
; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13
; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7
; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
@@ -358,20 +359,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_movk_i32 s5, 0x7e00
; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s7, s2, 0x1000
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s9, s4, 12
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s9
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s8, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s8, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s7
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
@@ -497,24 +499,24 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_movk_i32 s5, 0x7e00
; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s7, s2, 0x1000
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s9, s4, 12
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s9
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s8, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s8, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s7
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index 046f26246969..31e481bf7aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -1850,7 +1850,8 @@ define float @v_sqrt_f32_ulp2(float %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1886,7 +1887,8 @@ define float @v_sqrt_f32_ulp25(float %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1922,7 +1924,8 @@ define float @v_sqrt_f32_ulp3(float %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -1957,7 +1960,8 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, s[4:5]
@@ -2090,10 +2094,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v3
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2232,10 +2238,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[6:7], |v1|, v2
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7]
; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v0, |v0|, v3
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[6:7]
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_ldexp_f32_e64 v1, |v1|, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2328,7 +2336,8 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -2425,7 +2434,8 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
@@ -2509,7 +2519,8 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
@@ -2589,7 +2600,8 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
@@ -2658,10 +2670,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v3, 5, v3
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v3
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v2, 5, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2802,10 +2816,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -2929,10 +2945,12 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v4, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, v4
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v5, 5, v5
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v4, 5, v4
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
@@ -3029,7 +3047,8 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x)
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3064,7 +3083,8 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub)
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3099,7 +3119,8 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3698,7 +3719,8 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3733,7 +3755,8 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3768,7 +3791,8 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
@@ -3911,7 +3935,8 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT: v_lshlrev_b32_e32 v1, 5, v1
; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v1, v0, v1
; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GISEL-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, -16, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 196a3705ac81..1a3d00211ca9 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -40,8 +40,8 @@ define double @v_sqrt_f64(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -100,8 +100,8 @@ define double @v_sqrt_f64_fneg(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -161,8 +161,8 @@ define double @v_sqrt_f64_fabs(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -222,8 +222,8 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -284,8 +284,8 @@ define double @v_sqrt_f64_ninf(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -344,8 +344,8 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -404,8 +404,8 @@ define double @v_sqrt_f64_nnan(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -464,8 +464,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -533,8 +533,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -602,8 +602,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -671,8 +671,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -740,8 +740,8 @@ define double @v_sqrt_f64_nsz(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -800,8 +800,8 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -860,8 +860,8 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -920,8 +920,8 @@ define double @v_sqrt_f64_afn(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -980,8 +980,8 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1062,10 +1062,11 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
; GISEL-NEXT: v_mov_b32_e32 v5, s5
; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1139,8 +1140,8 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1199,8 +1200,8 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1260,8 +1261,8 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1342,10 +1343,11 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
; GISEL-NEXT: v_mov_b32_e32 v5, s5
; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1419,8 +1421,8 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1479,8 +1481,8 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1539,8 +1541,8 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1599,8 +1601,8 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
; GISEL-NEXT: v_mov_b32_e32 v2, 0
; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1681,10 +1683,11 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
; GISEL-NEXT: v_mov_b32_e32 v5, s5
; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1796,16 +1799,18 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
; GISEL-NEXT: s_mov_b32 s4, 0
; GISEL-NEXT: s_brev_b32 s5, 8
; GISEL-NEXT: v_mov_b32_e32 v6, s4
-; GISEL-NEXT: v_mov_b32_e32 v7, s5
; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v7, s5
; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
; GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
-; GISEL-NEXT: v_mov_b32_e32 v8, 0x100
-; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v8, vcc
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, v8, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v8, s[6:7]
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v9
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
@@ -1824,8 +1829,8 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
; GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
; GISEL-NEXT: v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
; GISEL-NEXT: v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index de9e320a363a..f24cc6f177d6 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -3087,8 +3087,8 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
ret void
}
-define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3) #0 {
-; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3, <4 x bfloat> %arg4) #0 {
+; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -3103,39 +3103,55 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v17
-; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
+; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4
+; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v17
+; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v8, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v13, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v12, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v16, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v11, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v10, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v13, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v9, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -3150,26 +3166,38 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v20
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v13, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v16, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v12, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -3184,36 +3212,54 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v20
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v13, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v16, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v12, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v32
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v33
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3232,19 +3278,28 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v38, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v37, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v32, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) undef
store volatile <2 x i16> %arg1, ptr addrspace(1) undef
store volatile <2 x half> %arg2, ptr addrspace(1) undef
store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef
+ store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll b/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll
index 174de0d58295..d35b9f79e23a 100644
--- a/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll
@@ -1,4 +1,5 @@
; RUN: not --crash llc -march=amdgcn -mcpu=gfx90a < %s 2>&1 | FileCheck %s
+; RUN: not --crash llc -march=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s
; GDS is not supported on GFX90A+
; CHECK: LLVM ERROR: Cannot select: {{.*}} AtomicLoadAdd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 064831f09866..5ebd3eef69f2 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -415,10 +415,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -484,10 +481,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -608,17 +602,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1610,10 +1601,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -1679,10 +1667,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -1803,17 +1788,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2849,10 +2831,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -2918,10 +2897,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -3042,17 +3018,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3586,10 +3559,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -3655,10 +3625,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -3779,17 +3746,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4780,10 +4744,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -4849,10 +4810,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -4973,17 +4931,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 66c89de1789e..ce1654b38d4b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -434,10 +434,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
@@ -507,11 +504,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
@@ -639,12 +633,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -1622,10 +1613,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
@@ -1695,11 +1683,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
@@ -1827,12 +1812,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -2810,10 +2792,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
@@ -2883,11 +2862,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
@@ -3015,12 +2991,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 17533e22ce2a..7379fd5a5422 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -434,10 +434,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
@@ -507,11 +504,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
@@ -639,12 +633,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -1622,10 +1613,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
@@ -1695,11 +1683,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
@@ -1827,12 +1812,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -2810,10 +2792,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
@@ -2883,11 +2862,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
@@ -3015,12 +2991,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index a8b83edfa743..b2c749c131f6 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -467,10 +467,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -536,10 +533,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -660,17 +654,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1706,10 +1697,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -1775,10 +1763,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -1899,17 +1884,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2945,10 +2927,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -3014,10 +2993,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -3138,17 +3114,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3726,10 +3699,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -3795,10 +3765,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -3919,17 +3886,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4964,10 +4928,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s2, s1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s0
-; GFX9-NEXT: s_add_i32 s2, s2, 32
-; GFX9-NEXT: s_min_u32 s2, s3, s2
+; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -5033,10 +4994,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
-; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
-; GFX1064-NEXT: s_add_i32 s2, s2, 32
-; GFX1064-NEXT: s_min_u32 s2, s3, s2
+; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
@@ -5157,17 +5115,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
-; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
-; GFX1164-NEXT: s_add_i32 s2, s2, 32
-; GFX1164-NEXT: s_min_u32 s2, s3, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 2d1f032070f0..76b007c22b69 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:SGPR_128 */, def %4
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7602186 /* regdef:SGPR_128 */, def %4
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %4
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7602185 /* reguse:SGPR_128 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:VReg_128 */, def %4
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %4
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:VReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6815754 /* regdef:VReg_128_Align2 */, def %4
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %4
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6815753 /* reguse:VReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -46,16 +46,16 @@ define amdgpu_kernel void @v_input_output_i128() {
define amdgpu_kernel void @a_input_output_i128() {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128 */, def %4
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %4
; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128 */, [[COPY]]
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6684682 /* regdef:AReg_128_Align2 */, def %4
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %4
; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6684681 /* reguse:AReg_128_Align2 */, [[COPY]]
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
index a9c2c2790389..3e977c054ec2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll
@@ -1,5 +1,8 @@
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: not --crash llc -march=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s
+
+; GFX12-ERR: LLVM ERROR: Cannot select: {{.*}} = DS_ORDERED_COUNT
; FUNC-LABEL: {{^}}ds_ordered_add:
; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
new file mode 100644
index 000000000000..6a6c5b33e0dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)
+
+define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) {
+; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 -4
+ %unused = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in)
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) {
+; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in)
+ store i64 %val, ptr addrspace(1) %use
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index b3912aea55f7..fcc57b8bb707 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -102,9 +102,9 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) {
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GISEL-NEXT: global_store_b64 v[2:3], v[4:5], off
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-NEXT: s_endpgm
@@ -164,8 +164,8 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace
; GISEL-NEXT: s_add_u32 s0, s0, 1
; GISEL-NEXT: s_addc_u32 s1, s1, 0
; GISEL-NEXT: .LBB5_2: ; %endif
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index b90c92bf9be5..5296ef1f8867 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -182,10 +182,7 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: .LBB2_5: ; %ComputeLoop
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_ff1_i32_b32 s3, s5
-; VI-NEXT: s_ff1_i32_b32 s6, s4
-; VI-NEXT: s_add_i32 s3, s3, 32
-; VI-NEXT: s_min_u32 s3, s6, s3
+; VI-NEXT: s_ff1_i32_b64 s3, s[4:5]
; VI-NEXT: s_lshl_b64 s[6:7], 1, s3
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_readlane_b32 s9, v2, s3
@@ -268,10 +265,7 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: .LBB2_5: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s5
-; GFX9-NEXT: s_ff1_i32_b32 s6, s4
-; GFX9-NEXT: s_add_i32 s3, s3, 32
-; GFX9-NEXT: s_min_u32 s3, s6, s3
+; GFX9-NEXT: s_ff1_i32_b64 s3, s[4:5]
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_readlane_b32 s9, v2, s3
@@ -543,10 +537,7 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: .LBB3_5: ; %ComputeLoop
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_ff1_i32_b32 s3, s5
-; VI-NEXT: s_ff1_i32_b32 s6, s4
-; VI-NEXT: s_add_i32 s3, s3, 32
-; VI-NEXT: s_min_u32 s3, s6, s3
+; VI-NEXT: s_ff1_i32_b64 s3, s[4:5]
; VI-NEXT: s_lshl_b64 s[6:7], 1, s3
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_readlane_b32 s9, v2, s3
@@ -625,10 +616,7 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: .LBB3_5: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_ff1_i32_b32 s3, s5
-; GFX9-NEXT: s_ff1_i32_b32 s6, s4
-; GFX9-NEXT: s_add_i32 s3, s3, 32
-; GFX9-NEXT: s_min_u32 s3, s6, s3
+; GFX9-NEXT: s_ff1_i32_b64 s3, s[4:5]
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_readlane_b32 s9, v2, s3
diff --git a/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir b/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir
new file mode 100644
index 000000000000..6e613243e38c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -march=amdgcn -mcpu=gfx900 %s -o - -run-pass machine-cp -verify-machineinstrs | FileCheck %s
+
+# The MachineCopyPropagation pass should not treat the second
+# "$sgpr2_sgpr3 = COPY $sgpr6_sgpr7" instruction as a NopCopy.
+# See issue 73512 for details.
+---
+name: foo
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+
+ ; CHECK-LABEL: name: foo
+ ; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr0
+ ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit $sgpr2_sgpr3
+ $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ $sgpr0 = COPY $sgpr3
+ S_NOP 0, implicit-def $sgpr0
+ $sgpr3 = COPY killed $sgpr5
+ $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ S_NOP 0, implicit $sgpr2_sgpr3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
new file mode 100644
index 000000000000..91ba353390f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -0,0 +1,332 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+
+define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; GFX9-LABEL: ctlz_i64_poison:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
+; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX9-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp
+; GFX9-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: ctlz_i64_poison:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
+; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+ %val = load i64, ptr addrspace(1) %arrayidx, align 1
+ %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
+ store i64 %ctlz, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; GFX9-LABEL: ctlz_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
+; GFX9-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX9-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp
+; GFX9-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: ctlz_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_ffbh_u32_e32 v2, v3
+; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
+; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
+; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+ %val = load i64, ptr addrspace(1) %arrayidx, align 1
+ %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone
+ store i64 %ctlz, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; GFX9-LABEL: cttz_i64_poison:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
+; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX9-NEXT: v_ffbl_b32_e32 v2, v2
+; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp
+; GFX9-NEXT: v_min_u32_e32 v0, v0, v2
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: cttz_i64_poison:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
+; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+ %val = load i64, ptr addrspace(1) %arrayidx, align 1
+ %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
+ store i64 %cttz, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
+; GFX9-LABEL: cttz_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6
+; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7
+; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3]
+; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_or_b32_e32 v2, v4, v3
+; GFX9-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX9-NEXT: v_ffbl_b32_e32 v2, v2
+; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp
+; GFX9-NEXT: v_min_u32_e32 v0, v0, v2
+; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: cttz_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5
+; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7
+; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6
+; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1
+; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3
+; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4
+; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3]
+; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v6
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
+; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
+; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
+; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
+; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
+; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+ %val = load i64, ptr addrspace(1) %arrayidx, align 1
+ %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone
+ store i64 %cttz, ptr addrspace(1) %out, align 8
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index b1ea54c307d4..75da11bf9a09 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -10,10 +10,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908: bb.0 (%ir-block.0):
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:VReg_128 */, def %26
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3801098 /* regdef:VReg_64 */, def %23
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23
; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
@@ -35,10 +35,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
+ ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3801098 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
+ ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -59,10 +59,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A: bb.0 (%ir-block.0):
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6815754 /* regdef:VReg_128_Align2 */, def %25
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4128778 /* regdef:VReg_64_Align2 */, def %23
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23
; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -82,10 +82,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6815754 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4128778 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index ade192bde4dc..b1ec70d89fa4 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -97,10 +97,10 @@ body: |
; GCN-LABEL: name: fold_vimm_16_sub_to_lo
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_lo16 = COPY killed [[S_MOV_B32_]].lo16
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY killed [[S_MOV_B32_]].lo16
; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
%0:sreg_32 = S_MOV_B32 2048
- %1:vgpr_lo16 = COPY killed %0.lo16
+ %1:vgpr_16 = COPY killed %0.lo16
SI_RETURN_TO_EPILOG %1
...
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 3dc565ceed0d..3ad98719c689 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -62,12 +62,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -146,8 +146,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -242,12 +242,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -326,8 +326,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |s[0:1]|, v0
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -423,12 +423,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -507,8 +507,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -603,12 +603,12 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -687,8 +687,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -s[0:1], v0
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -784,12 +784,12 @@ define double @v_rsq_f64(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -866,8 +866,8 @@ define double @v_rsq_f64(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -953,12 +953,12 @@ define double @v_rsq_f64_fabs(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -1035,8 +1035,8 @@ define double @v_rsq_f64_fabs(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1123,12 +1123,12 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -1205,8 +1205,8 @@ define double @v_rsq_f64_missing_contract0(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1292,12 +1292,12 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -1374,8 +1374,8 @@ define double @v_rsq_f64_missing_contract1(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1461,12 +1461,12 @@ define double @v_neg_rsq_f64(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -1543,8 +1543,8 @@ define double @v_neg_rsq_f64(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1664,26 +1664,27 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: s_mov_b32 s4, 0
; SI-GISEL-NEXT: s_brev_b32 s5, 8
; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1815,10 +1816,11 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1965,26 +1967,27 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: s_mov_b32 s4, 0
; SI-GISEL-NEXT: s_brev_b32 s5, 8
; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2116,10 +2119,11 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2234,17 +2238,17 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; SI-GISEL-NEXT: s_mov_b32 s4, 0
; SI-GISEL-NEXT: s_brev_b32 s5, 8
; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -2252,7 +2256,8 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2356,10 +2361,11 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2507,17 +2513,17 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: s_mov_b32 s4, 0
; SI-GISEL-NEXT: s_brev_b32 s5, 8
; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
; SI-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
@@ -2525,7 +2531,8 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
; SI-GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc
; SI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2659,10 +2666,11 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2775,12 +2783,12 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -2857,8 +2865,8 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -2946,12 +2954,12 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -3028,8 +3036,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3107,12 +3115,12 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3177,8 +3185,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3252,12 +3260,12 @@ define double @v_rsq_f64__afn(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3322,8 +3330,8 @@ define double @v_rsq_f64__afn(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3398,12 +3406,12 @@ define double @v_neg_rsq_f64__afn(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3470,8 +3478,8 @@ define double @v_neg_rsq_f64__afn(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3546,12 +3554,12 @@ define double @v_rsq_f64__afn_ninf(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3616,8 +3624,8 @@ define double @v_rsq_f64__afn_ninf(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3691,12 +3699,12 @@ define double @v_rsq_f64__afn_nnan(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3761,8 +3769,8 @@ define double @v_rsq_f64__afn_nnan(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3836,12 +3844,12 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3906,8 +3914,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3982,12 +3990,12 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4054,8 +4062,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4138,12 +4146,12 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
@@ -4220,8 +4228,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4325,13 +4333,15 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; SI-GISEL-NEXT: s_mov_b32 s4, 0
; SI-GISEL-NEXT: s_brev_b32 s5, 8
; SI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, s4
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, s5
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; SI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
+; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -4339,30 +4349,29 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v12, s[4:5]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
; SI-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v12, 0xffffff80
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v12, vcc
+; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[10:11], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[2:3], v[10:11]
-; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v13
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
; SI-GISEL-NEXT: v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v13
-; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT: v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
; SI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[4:5]
; SI-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v13
; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[0:1]
@@ -4451,10 +4460,11 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v5, s5
; VI-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v7
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
; VI-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
@@ -4550,12 +4560,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4622,8 +4632,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4706,12 +4716,12 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4776,8 +4786,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
@@ -5112,12 +5122,12 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5193,8 +5203,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5279,12 +5289,12 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5360,8 +5370,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5446,12 +5456,12 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; SI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; SI-GISEL-NEXT: v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], v[4:5]
; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5527,8 +5537,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v5, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
; VI-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[2:3]
; VI-GISEL-NEXT: v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5616,17 +5626,17 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; SI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; SI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; SI-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; SI-GISEL-NEXT: s_mov_b32 s6, 0
; SI-GISEL-NEXT: s_mov_b32 s7, 0x40700000
+; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000
; SI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000
; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -5702,10 +5712,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0
; VI-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
; VI-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x100
; VI-GISEL-NEXT: s_mov_b32 s4, 0
; VI-GISEL-NEXT: s_mov_b32 s5, 0x40700000
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
; VI-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index c48370a9c6c7..7f84d21fbbc4 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -156,15 +156,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7]
; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9]
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT: s_min_u32 s14, s8, s9
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13
-; GCN-IR-NEXT: s_min_u32 s20, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13]
; GCN-IR-NEXT: s_sub_u32 s16, s14, s20
; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
@@ -993,15 +987,9 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: s_subb_u32 s7, s7, s4
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7]
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT: s_min_u32 s14, s8, s9
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13
-; GCN-IR-NEXT: s_min_u32 s20, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13]
; GCN-IR-NEXT: s_sub_u32 s16, s14, s20
; GCN-IR-NEXT: s_subb_u32 s17, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
@@ -1203,10 +1191,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GCN-IR-NEXT: s_sub_u32 s2, s2, s4
; GCN-IR-NEXT: s_subb_u32 s3, s3, s4
-; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT: s_add_i32 s10, s10, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT: s_min_u32 s14, s10, s11
+; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3]
; GCN-IR-NEXT: s_add_u32 s10, s14, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s11, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index b1cdc79016fc..30ace3946468 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -152,6 +152,52 @@ define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
+; GCN-LABEL: {{^}}v_select_v16i16:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+ %a = load <16 x i16>, ptr addrspace(1) %a.ptr
+ %b = load <16 x i16>, ptr addrspace(1) %b.ptr
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <16 x i16> %a, <16 x i16> %b
+ store <16 x i16> %select, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v32i16:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+ %a = load <32 x i16>, ptr addrspace(1) %a.ptr
+ %b = load <32 x i16>, ptr addrspace(1) %b.ptr
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <32 x i16> %a, <32 x i16> %b
+ store <32 x i16> %select, ptr addrspace(1) %out, align 4
+ ret void
+}
+
; FIXME: Expansion with bitwise operations may be better if doing a
; vector select with SGPR inputs.
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index adce63c7e45e..db0d9c1aaa21 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -119,6 +119,7 @@ define amdgpu_kernel void @select_f16(
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
@@ -1317,3 +1318,1817 @@ entry:
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
+
+define <4 x half> @v_select_v4f16(<4 x half> %a, <4 x half> %b, i32 %cond) {
+; SI-LABEL: v_select_v4f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
+; SI-NEXT: v_or_b32_e32 v1, v4, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_select_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %cond, 0
+ %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
+ ret <4 x half> %select
+}
+
+define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond) {
+; SI-LABEL: v_vselect_v4f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
+; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
+; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_vselect_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq <4 x i32> %cond, zeroinitializer
+ %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+ ret <4 x half> %select
+}
+
+define <8 x half> @v_select_v8f16(<8 x half> %a, <8 x half> %b, i32 %cond) {
+; SI-LABEL: v_select_v8f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9
+; SI-NEXT: v_or_b32_e32 v7, v14, v7
+; SI-NEXT: v_or_b32_e32 v5, v12, v5
+; SI-NEXT: v_or_b32_e32 v3, v10, v3
+; SI-NEXT: v_or_b32_e32 v1, v8, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_select_v8f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v8f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %cond, 0
+ %select = select i1 %cmp, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %select
+}
+
+define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond) {
+; SI-LABEL: v_vselect_v8f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
+; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
+; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
+; SI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
+; SI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_vselect_v8f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v17, v16, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
+; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
+; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v11
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v13
+; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v15
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v8f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10
+; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-NEXT: v_perm_b32 v3, v15, v3, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq <8 x i32> %cond, zeroinitializer
+ %select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %select
+}
+
+define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) {
+; SI-LABEL: v_select_v16f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v28
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v27
+; SI-NEXT: v_or_b32_e32 v13, v15, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_or_b32_e32 v11, v15, v11
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:4
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: v_or_b32_e32 v9, v24, v25
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: v_or_b32_e32 v5, v20, v21
+; SI-NEXT: v_or_b32_e32 v3, v18, v3
+; SI-NEXT: v_or_b32_e32 v1, v16, v1
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v28
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_or_b32_e32 v7, v26, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v15, v22, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
+; SI-NEXT: v_cndmask_b32_e32 v16, v7, v14, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_select_v16f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v16f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v2 :: v_dual_cndmask_b32 v3, v11, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v12, v4 :: v_dual_cndmask_b32 v5, v13, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %cond, 0
+ %select = select i1 %cmp, <16 x half> %a, <16 x half> %b
+ ret <16 x half> %select
+}
+
+define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> %cond) {
+; SI-LABEL: v_vselect_v16f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; SI-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7]
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; SI-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9]
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; SI-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[10:11]
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; SI-NEXT: v_cndmask_b32_e64 v3, v16, v3, s[12:13]
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; SI-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[14:15]
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cndmask_b32_e64 v5, v16, v5, s[4:5]
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31
+; SI-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[16:17]
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52
+; SI-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; SI-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc
+; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64
+; SI-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: v_cndmask_b32_e32 v14, v20, v14, vcc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_vselect_v16f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: v_writelane_b32 v31, s30, 0
+; VI-NEXT: v_writelane_b32 v31, s31, 1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17
+; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
+; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v24
+; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27
+; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31]
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19
+; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29]
+; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[10:11]
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32
+; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25
+; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v20
+; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23
+; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27]
+; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; VI-NEXT: v_cmp_eq_u32_e64 s[22:23], 0, v21
+; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25]
+; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v22
+; VI-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[22:23]
+; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21]
+; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7]
+; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20
+; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v26
+; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19]
+; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[8:9]
+; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19
+; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13]
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v15
+; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22
+; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18
+; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v28
+; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v30
+; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v17
+; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[14:15]
+; VI-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[16:17]
+; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_readlane_b32 s31, v31, 1
+; VI-NEXT: v_readlane_b32 s30, v31, 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; VI-NEXT: v_cndmask_b32_e32 v8, v13, v11, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: s_mov_b64 exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v16f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v5, v9, v5, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v4, v10, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
+; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v7, v11, v7, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq <16 x i32> %cond, zeroinitializer
+ %select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b
+ ret <16 x half> %select
+}
+
+define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) {
+; SI-LABEL: v_select_v32f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v20, v20, v21
+; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v12, v12, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56
+; SI-NEXT: v_or_b32_e32 v10, v10, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48
+; SI-NEXT: v_or_b32_e32 v8, v8, v9
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32
+; SI-NEXT: v_or_b32_e32 v4, v4, v5
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_or_b32_e32 v22, v22, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v30
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT: v_or_b32_e32 v24, v24, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v26, v26, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_or_b32_e32 v28, v28, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_or_b32_e32 v18, v18, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_or_b32_e32 v16, v16, v17
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_or_b32_e32 v14, v14, v15
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80
+; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT: v_or_b32_e32 v21, v23, v21
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(13)
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: s_waitcnt vmcnt(8)
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: s_waitcnt vmcnt(7)
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT: v_or_b32_e32 v23, v25, v23
+; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT: v_or_b32_e32 v25, v27, v25
+; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_or_b32_e32 v27, v29, v27
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; SI-NEXT: v_or_b32_e32 v29, v30, v29
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; SI-NEXT: v_or_b32_e32 v30, v31, v30
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; SI-NEXT: v_or_b32_e32 v31, v32, v31
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v19, v32, v19
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v17, v32, v17
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v15, v32, v15
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v13, v32, v13
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v11, v32, v11
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v9, v32, v9
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v7, v32, v7
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v5, v32, v5
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v3, v32, v3
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_or_b32_e32 v1, v32, v1
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc
+; SI-NEXT: v_cndmask_b32_e32 v15, v15, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
+; SI-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
+; SI-NEXT: v_cndmask_b32_e32 v31, v31, v20, vcc
+; SI-NEXT: v_cndmask_b32_e32 v30, v30, v22, vcc
+; SI-NEXT: v_cndmask_b32_e32 v29, v29, v24, vcc
+; SI-NEXT: v_cndmask_b32_e32 v27, v27, v26, vcc
+; SI-NEXT: v_cndmask_b32_e32 v32, v25, v28, vcc
+; SI-NEXT: v_cndmask_b32_e32 v33, v23, v21, vcc
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v9
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v11
+; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v13
+; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v15
+; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v17
+; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v19
+; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v31
+; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v31
+; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v29
+; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v27
+; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
+; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v32
+; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v30
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v33
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_select_v32f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
+; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
+; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
+; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
+; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
+; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
+; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
+; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_select_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v5, v21, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7
+; GFX11-NEXT: v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v9, v25, v9
+; GFX11-NEXT: v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11
+; GFX11-NEXT: v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v13, v29, v13
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_cndmask_b32 v14, v30, v14 :: v_dual_cndmask_b32 v15, v32, v15
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %cond, 0
+ %select = select i1 %cmp, <32 x half> %a, <32 x half> %b
+ ret <32 x half> %select
+}
+
+define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> %cond) {
+; SI-LABEL: v_vselect_v32f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; SI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; SI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f16_f32_e32 v32, v32
+; SI-NEXT: v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e64 v0, v31, v0, s[16:17]
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e64 v1, v31, v1, s[14:15]
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e64 v2, v31, v2, s[12:13]
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e64 v3, v31, v3, s[10:11]
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e64 v4, v31, v4, s[8:9]
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e64 v5, v31, v5, s[6:7]
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e64 v6, v31, v6, s[4:5]
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v17, v31, v17, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_vselect_v32f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120
+; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30
+; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v29
+; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v28
+; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v27
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v26
+; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36
+; VI-NEXT: v_cndmask_b32_e32 v36, v43, v38, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35
+; VI-NEXT: v_cndmask_b32_e32 v35, v45, v44, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
+; VI-NEXT: v_cndmask_b32_e32 v34, v47, v46, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33
+; VI-NEXT: v_cndmask_b32_e32 v33, v57, v56, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; VI-NEXT: v_cndmask_b32_e32 v32, v59, v58, vcc
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v24
+; VI-NEXT: v_cndmask_b32_e32 v38, v38, v60, vcc
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v15
+; VI-NEXT: v_cndmask_b32_e32 v39, v44, v43, vcc
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
+; VI-NEXT: s_waitcnt vmcnt(10)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55
+; VI-NEXT: v_cndmask_b32_e32 v31, v31, v45, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50
+; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53
+; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54
+; VI-NEXT: v_cndmask_b32_e32 v54, v43, v55, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52
+; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51
+; VI-NEXT: v_cndmask_b32_e32 v51, v43, v55, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49
+; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc
+; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48
+; VI-NEXT: v_cndmask_b32_e32 v48, v46, v43, vcc
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52
+; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v16
+; VI-NEXT: s_waitcnt vmcnt(10)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44
+; VI-NEXT: v_cndmask_b32_e32 v46, v58, v46, vcc
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28
+; VI-NEXT: v_cndmask_b32_e32 v15, v37, v15, vcc
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12
+; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44
+; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55
+; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45
+; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc
+; VI-NEXT: s_waitcnt vmcnt(10)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47
+; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56
+; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43
+; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57
+; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59
+; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40
+; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v58
+; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41
+; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37
+; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42
+; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v46
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49
+; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v51
+; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52
+; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v54
+; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53
+; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50
+; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v39
+; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v38
+; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32
+; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v33
+; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34
+; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v35
+; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v36
+; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v31
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_vselect_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x20
+; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v33, off, s32
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:104
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:88
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:100
+; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128
+; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15
+; GFX11-NEXT: s_waitcnt vmcnt(32)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34
+; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v33
+; GFX11-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(28)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35
+; GFX11-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36
+; GFX11-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(26)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37
+; GFX11-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(25)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38
+; GFX11-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(24)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39
+; GFX11-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(23)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48
+; GFX11-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(22)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49
+; GFX11-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(21)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50
+; GFX11-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(20)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51
+; GFX11-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(19)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52
+; GFX11-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(18)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53
+; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(17)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54
+; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(16)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(14)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v14, v97, v14, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v12, v34, v12, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69
+; GFX11-NEXT: v_perm_b32 v13, v99, v13, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v10, v36, v10, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71
+; GFX11-NEXT: v_perm_b32 v11, v35, v11, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v8, v38, v8, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81
+; GFX11-NEXT: v_perm_b32 v9, v37, v9, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83
+; GFX11-NEXT: v_perm_b32 v7, v39, v7, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v4, v50, v4, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86
+; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v0, v32, v0, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87
+; GFX11-NEXT: v_perm_b32 v3, v51, v3, 0x5040100
+; GFX11-NEXT: v_perm_b32 v2, v52, v2, 0x5040100
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo
+; GFX11-NEXT: v_perm_b32 v1, v31, v1, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq <32 x i32> %cond, zeroinitializer
+ %select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b
+ ret <32 x half> %select
+}
diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir
new file mode 100644
index 000000000000..4292e76f3709
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=si-fix-sgpr-copies --verify-machineinstrs -o - %s | FileCheck %s
+
+# The copy to $sgpr0 is disconnected and becomes an IMPLICIT_DEF.
+# The inserted V_AND_B32 defines its virtual register after the use.
+
+---
+name: si_fix_sgpr_copies_breaks_function
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: si_fix_sgpr_copies_breaks_function
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; CHECK-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
+ ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -32768
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_1]]
+ ; CHECK-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 killed [[COPY1]], [[COPY2]], implicit $exec
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_2]], [[V_XOR_B32_e64_]], implicit $exec
+ ; CHECK-NEXT: $sgpr0 = V_READFIRSTLANE_B32 [[V_AND_B32_e64_]], implicit $exec
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+ %0:sgpr_32 = COPY $sgpr0
+ %2:sreg_32 = S_MOV_B32 16
+ %3:sreg_32 = S_LSHR_B32 %0, killed %2, implicit-def dead $scc
+ %4:sreg_32 = COPY killed %3
+ %5:sreg_32 = S_MOV_B32 -32768
+ %7:vgpr_32 = COPY killed %5
+ %6:vgpr_32 = V_XOR_B32_e64 killed %4, %7, implicit $exec
+ %8:sreg_32 = S_MOV_B32 65535
+ %10:sreg_32 = COPY %6
+ %9:sreg_32 = S_AND_B32 killed %8, killed %10, implicit-def dead $scc
+ $sgpr0 = COPY %9
+ SI_RETURN_TO_EPILOG $sgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 0fdc683d3169..d5f97314f932 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,12 +12,12 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2490378 /* regdef:VGPR_32 */, def undef %24.sub0
+ ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %24.sub0
; GCN-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
+ ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
; GCN-NEXT: S_ENDPGM 0
%v0 = call i32 asm sideeffect "; def $0", "=v"()
%tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ac212d22e9cf..70e75116e180 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -124,18 +124,14 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
-; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5
-; GCN-IR-NEXT: s_add_i32 s10, s10, 32
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s10, s10, s11
-; GCN-IR-NEXT: s_min_u32 s18, s6, s7
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5]
+; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GCN-IR-NEXT: s_sub_u32 s12, s10, s18
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
@@ -145,9 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
@@ -1029,15 +1023,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_subb_u32 s9, s7, s10
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9]
; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s9
-; GCN-IR-NEXT: s_min_u32 s12, s6, s7
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s20, s6, s7
+; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[2:3]
; GCN-IR-NEXT: s_sub_u32 s14, s12, s20
; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
@@ -1180,15 +1168,9 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
; GCN-IR-NEXT: s_subb_u32 s7, s7, s10
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[6:7]
; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT: s_min_u32 s12, s8, s9
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s4
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5
-; GCN-IR-NEXT: s_min_u32 s20, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[4:5]
; GCN-IR-NEXT: s_sub_u32 s14, s12, s20
; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
@@ -1393,10 +1375,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9]
; GCN-IR-NEXT: s_sub_u32 s4, s2, s8
; GCN-IR-NEXT: s_subb_u32 s5, s3, s8
-; GCN-IR-NEXT: s_flbit_i32_b32 s2, s4
-; GCN-IR-NEXT: s_add_i32 s2, s2, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s3, s5
-; GCN-IR-NEXT: s_min_u32 s12, s2, s3
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[4:5]
; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s3, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
new file mode 100644
index 000000000000..0339fca4d56c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX7 %s
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+
+define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fpext_f16_to_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %val = load half, ptr addrspace(1) %ptr
+ %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict")
+ ret float %result
+}
+
+define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0 {
+; GFX7-LABEL: v_constrained_fpext_v2f16_to_v2f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %val = load <2 x half>, ptr addrspace(1) %ptr
+ %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict")
+ ret <2 x float> %result
+}
+
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 9301170c034d..7a8d19200a72 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -125,18 +125,14 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
-; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5
-; GCN-IR-NEXT: s_add_i32 s10, s10, 32
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s10, s10, s11
-; GCN-IR-NEXT: s_min_u32 s16, s6, s7
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5]
+; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[2:3]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GCN-IR-NEXT: s_sub_u32 s12, s10, s16
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
@@ -146,9 +142,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
@@ -796,15 +790,9 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3]
; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
-; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2
-; GCN-IR-NEXT: s_add_i32 s4, s4, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT: s_min_u32 s10, s4, s5
-; GCN-IR-NEXT: s_flbit_i32_b32 s4, s8
-; GCN-IR-NEXT: s_add_i32 s4, s4, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s5, s9
-; GCN-IR-NEXT: s_min_u32 s16, s4, s5
+; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9]
; GCN-IR-NEXT: s_sub_u32 s12, s10, s16
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
@@ -986,10 +974,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_min_u32 s12, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
@@ -1406,10 +1391,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_min_u32 s12, s6, s7
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
; GCN-IR-NEXT: s_sub_u32 s8, 59, s12
; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 784993ccd3bd..3ec51b01c7a3 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -124,18 +124,14 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
-; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4
-; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9]
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5
-; GCN-IR-NEXT: s_add_i32 s10, s10, 32
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_min_u32 s10, s10, s11
-; GCN-IR-NEXT: s_min_u32 s18, s6, s7
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
+; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5]
+; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3]
+; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GCN-IR-NEXT: s_sub_u32 s12, s10, s18
; GCN-IR-NEXT: s_subb_u32 s13, 0, 0
; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
@@ -145,9 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3
; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2
; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT: s_mov_b64 s[6:7], 0
; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT: s_mov_b32 s11, 0
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
@@ -814,10 +808,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b64 s[4:5], 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3
-; GCN-IR-NEXT: s_add_i32 s8, s8, 32
-; GCN-IR-NEXT: s_min_u32 s12, s8, s9
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5
; GCN-IR-NEXT: s_addc_u32 s9, 0, -1
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
@@ -973,10 +964,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_min_u32 s12, s6, s7
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
; GCN-IR-NEXT: s_sub_u32 s8, 59, s12
; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 91cff9d1a541..844aa57de05c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1582,8 +1582,8 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
ret <6 x half> %shuffle
}
-define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
-; GFX9-LABEL: fma_shuffle:
+define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
+; GFX9-LABEL: fma_shuffle_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
@@ -1600,7 +1600,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: fma_shuffle:
+; GFX10-LABEL: fma_shuffle_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1619,7 +1619,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: fma_shuffle:
+; GFX11-LABEL: fma_shuffle_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
@@ -1758,12 +1758,8 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in,
ret void
}
-declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nounwind readnone speculatable }
-define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: low16bits:
+define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16bits_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1773,7 +1769,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: low16bits:
+; GFX10-LABEL: low16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1782,7 +1778,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: low16bits:
+; GFX11-LABEL: low16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -1798,8 +1794,8 @@ entry:
ret <2 x half> %vy1.2.vec.insert
}
-define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: hi16bits:
+define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16bits_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1809,7 +1805,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: hi16bits:
+; GFX10-LABEL: hi16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1818,7 +1814,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: hi16bits:
+; GFX11-LABEL: hi16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -1834,8 +1830,8 @@ entry:
ret <2 x half> %vy1.2.vec.insert
}
-define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: low16hi16bits:
+define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16hi16bits_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1845,7 +1841,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: low16hi16bits:
+; GFX10-LABEL: low16hi16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1854,7 +1850,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: low16hi16bits:
+; GFX11-LABEL: low16hi16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -1870,8 +1866,8 @@ entry:
ret <2 x half> %vy1.2.vec.insert
}
-define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: hi16low16bits:
+define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16low16bits_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1880,7 +1876,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: hi16low16bits:
+; GFX10-LABEL: hi16low16bits_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1889,7 +1885,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: hi16low16bits:
+; GFX11-LABEL: hi16low16bits_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -2675,3 +2671,2354 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
store <16 x i32> %shuffle, ptr addrspace(1) %out
ret void
}
+
+define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_23uu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_23uu:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_23uu:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_234u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_234u:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_234u:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u1u3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u1u3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u1u3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u3u1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u3u1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u3u1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u3uu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u3uu:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u3uu:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3u6u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3u6u:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3u6u:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3uu7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3uu7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3uu7:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_35u5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s5, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_35u5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_35u5:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_357u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_357u:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x3020706
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_357u:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x3020706
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0101:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0101:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0101:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0123:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0123:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0123:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0145:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0145:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0145:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0167:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0167:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0167:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2301:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2301:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2301:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2323:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2323:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2323:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2345:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2345:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2345:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2367:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2367:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2367:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4501:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
+; GFX9-NEXT: global_load_dword v5, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4501:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4501:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4523:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4523:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4523:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4545:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4545:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4545:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4567:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4567:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4567:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6701:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6701:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6701:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6723:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6723:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6723:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6745:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6745:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6745:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6767:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6767:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6767:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2356:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX9-NEXT: v_and_or_b32 v0, v6, s4, v0
+; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2356:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v6, v0
+; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2356:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5623:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5623:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5623:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3456:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v2, v4, v6, 16
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_perm_b32 v1, v4, v0, s5
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3456:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v2, v5, v4, 16
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3456:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5634:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v4, v0, s5
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5634:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5634:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16
+; GFX11-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5734:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5734:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5734:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16
+; GFX11-NEXT: v_perm_b32 v1, v0, v1, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0000:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0000:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0000:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> zeroinitializer
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_1010:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_1010:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_1010:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_1100:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x7060706
+; GFX9-NEXT: s_mov_b32 s6, 0x3020706
+; GFX9-NEXT: s_mov_b32 s7, 0x3020504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT: v_perm_b32 v2, v1, v1, s5
+; GFX9-NEXT: v_and_or_b32 v3, v1, s4, v0
+; GFX9-NEXT: v_perm_b32 v0, v1, v2, s6
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_1100:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_perm_b32 v2, v1, v1, 0x7060706
+; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
+; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x3020504
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_1100:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: v_perm_b32 v2, v1, v1, 0x7060706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
+; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x3020504
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6161:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6161:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6161:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2333:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x7060706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2333:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2333:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6667:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x7060706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6667:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6667:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_0101:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_0101:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_0101:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_0123:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_0123:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_0123:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_4589:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_4589:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_4589:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v3bf16_0122:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v3bf16_0122:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v3bf16_0122:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <3 x bfloat> %val0, <3 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v2bf16_0122:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v2bf16_0122:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v2bf16_0122:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <2 x bfloat> %val0, <2 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+ ret <4 x bfloat> %shuffle
+}
+
+define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v6bf16_452367:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
+; GFX9-NEXT: global_load_dword v7, v[3:4], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v6bf16_452367:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
+; GFX10-NEXT: global_load_dword v7, v[3:4], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, v7
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v6bf16_452367:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[3:4], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <6 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <6 x bfloat> %val0, <6 x bfloat> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
+ ret <6 x bfloat> %shuffle
+}
+
+define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
+; GFX9-LABEL: fma_shuffle_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1]
+; GFX9-NEXT: s_mov_b32 s0, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_fma_f32 v7, v9, v8, v7
+; GFX9-NEXT: v_fma_f32 v0, v9, v2, v0
+; GFX9-NEXT: v_fma_f32 v8, v11, v8, v12
+; GFX9-NEXT: v_fma_f32 v1, v11, v2, v1
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX9-NEXT: v_fma_f32 v0, v4, v10, v0
+; GFX9-NEXT: v_fma_f32 v2, v4, v3, v2
+; GFX9-NEXT: v_fma_f32 v1, v5, v10, v1
+; GFX9-NEXT: v_fma_f32 v3, v5, v3, v7
+; GFX9-NEXT: v_perm_b32 v0, v2, v0, s0
+; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: fma_shuffle_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
+; GFX10-NEXT: v_fmac_f32_e32 v0, v9, v2
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v2
+; GFX10-NEXT: v_fmac_f32_e32 v1, v11, v8
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_fmac_f32_e32 v0, v4, v10
+; GFX10-NEXT: v_fmac_f32_e32 v5, v4, v3
+; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v10
+; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v3
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v7, 0x3020706
+; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: fma_shuffle_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3]
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_fmac_f32_e32 v1, v11, v8
+; GFX11-NEXT: v_dual_fmac_f32 v12, v11, v2 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_fmac_f32 v0, v9, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX11-NEXT: v_dual_fmac_f32 v1, v2, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v0, v4, v10
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fmac_f32_e32 v5, v4, v3
+; GFX11-NEXT: v_fmac_f32_e32 v7, v2, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v7, 0x3020706
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp12 = zext i32 %tmp1 to i64
+ %arrayidx = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %A, i64 %tmp12
+ %tmp14 = load <4 x bfloat>, ptr addrspace(1) %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %B, i64 %tmp12
+ %tmp15 = load <4 x bfloat>, ptr addrspace(1) %arrayidx1, align 8
+ %arrayidx2 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %C, i64 %tmp12
+ %tmp16 = load <4 x bfloat>, ptr addrspace(1) %arrayidx2, align 8
+ %tmp17 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> zeroinitializer
+ %tmp18 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+ %tmp19 = shufflevector <4 x bfloat> %tmp16, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+ %tmp20 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp17, <2 x bfloat> %tmp18, <2 x bfloat> %tmp19)
+ %tmp21 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 1, i32 1>
+ %tmp22 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
+ %tmp23 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp21, <2 x bfloat> %tmp22, <2 x bfloat> %tmp20)
+ %tmp24 = shufflevector <2 x bfloat> %tmp23, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %tmp25 = shufflevector <4 x bfloat> %tmp24, <4 x bfloat> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ %tmp26 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 2, i32 2>
+ %tmp27 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
+ %tmp28 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp26, <2 x bfloat> %tmp18, <2 x bfloat> %tmp27)
+ %tmp29 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 3, i32 3>
+ %tmp30 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp29, <2 x bfloat> %tmp22, <2 x bfloat> %tmp28)
+ %tmp31 = shufflevector <2 x bfloat> %tmp30, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %tmp32 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ store <4 x bfloat> %tmp32, ptr addrspace(1) %arrayidx2, align 8
+ ret void
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0456:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1000504
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4
+; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0456:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x1000504
+; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0456:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x1000504
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+ ret <4 x bfloat> %shuffle
+}
+
+define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16bits:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1000504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: low16bits:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x1000504
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: low16bits:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x1000504
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16bits_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: hi16bits_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: hi16bits_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16hi16bits_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: low16hi16bits_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020504
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: low16hi16bits_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020504
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16low16bits:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: hi16low16bits:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: hi16low16bits:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @v2bfloat_hi16bits(ptr addrspace(1) %x0) {
+; GFX9-LABEL: v2bfloat_hi16bits:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v2bfloat_hi16bits:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2bfloat_hi16bits:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %load0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %insert1 = insertelement <2 x bfloat> undef, bfloat 0.0, i32 0
+ %insert2 = insertelement <2 x bfloat> %insert1, bfloat 0.0, i32 1
+ %vec.ret = shufflevector <2 x bfloat> %insert2, <2 x bfloat> %load0, <2 x i32> <i32 0, i32 3>
+ ret <2 x bfloat> %vec.ret
+}
+
+define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v8bf16_concat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_concat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_concat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x bfloat> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v16bf16_concat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v16bf16_concat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v16bf16_concat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x bfloat> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v32bf16_concat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v32bf16_concat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v32bf16_concat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <16 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <16 x bfloat> %val0, <16 x bfloat> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x bfloat> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir b/llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir
new file mode 100644
index 000000000000..a2182aa8d6ef
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir
@@ -0,0 +1,10 @@
+# RUN: not --crash llc -march=amdgcn -mcpu=gfx1200 -run-pass=none -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12 %s
+
+---
+name: gds
+body: |
+ bb.0:
+ ; GFX12: *** Bad machine code: GDS is not supported on this subtarget ***
+ ; GFX12: - instruction: DS_ADD_U32 %0:vgpr_32, %1:vgpr_32, 0, 1, implicit $m0, implicit $exec :: (load store acq_rel (s32), addrspace 2)
+ DS_ADD_U32 %0:vgpr_32, %2:vgpr_32, 0, 1, implicit $m0, implicit $exec :: (load store acq_rel (s32), addrspace 2)
+...
diff --git a/llvm/test/CodeGen/BPF/loop-exit-cond.ll b/llvm/test/CodeGen/BPF/loop-exit-cond.ll
index 7666d961753a..df6a2489a432 100644
--- a/llvm/test/CodeGen/BPF/loop-exit-cond.ll
+++ b/llvm/test/CodeGen/BPF/loop-exit-cond.ll
@@ -26,7 +26,7 @@ target triple = "bpf"
; Function Attrs: nounwind
define dso_local i32 @test(i32 %len, ptr %data) #0 {
-; CHECK-LABEL: define dso_local i32 @test(
+; CHECK-LABEL: define dso_local noundef i32 @test(
; CHECK-SAME: i32 [[LEN:%.*]], ptr nocapture readonly [[DATA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[D:%.*]] = alloca [1 x i64], align 8
diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
index 6824ab5cda8d..ae6f31aaec64 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
@@ -57,7 +57,7 @@ define void @buildvector_v8f32_splat(ptr %dst, float %a0) nounwind {
; CHECK-LABEL: buildvector_v8f32_splat:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
-; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 0
+; CHECK-NEXT: xvreplve0.w $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -71,7 +71,7 @@ define void @buildvector_v4f64_splat(ptr %dst, double %a0) nounwind {
; CHECK-LABEL: buildvector_v4f64_splat:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
-; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 0
+; CHECK-NEXT: xvreplve0.d $xr0, $xr0
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
index 78f584cd09a8..fc2929d8e6db 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
@@ -31,7 +31,7 @@ define void @extract_8xi32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract_8xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1
; CHECK-NEXT: st.w $a0, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <8 x i32>, ptr %src
@@ -44,7 +44,7 @@ define void @extract_4xi64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract_4xi64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
; CHECK-NEXT: st.d $a0, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x i64>, ptr %src
@@ -57,8 +57,8 @@ define void @extract_8xfloat(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract_8xfloat:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: ori $a0, $zero, 7
-; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a0
+; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
+; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: fst.s $fa0, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <8 x float>, ptr %src
@@ -71,8 +71,8 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract_4xdouble:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: ori $a0, $zero, 3
-; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
+; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: fst.d $fa0, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x double>, ptr %src
@@ -84,12 +84,21 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind {
define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-LABEL: extract_32xi8_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0
+; CHECK-NEXT: addi.d $sp, $sp, -64
+; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 64
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: xvreplve.b $xr0, $xr0, $a2
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: srai.w $a0, $a0, 24
+; CHECK-NEXT: xvst $xr0, $sp, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 0
+; CHECK-NEXT: ld.b $a0, $a0, 0
; CHECK-NEXT: st.b $a0, $a1, 0
+; CHECK-NEXT: addi.d $sp, $fp, -64
+; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
%v = load volatile <32 x i8>, ptr %src
%e = extractelement <32 x i8> %v, i32 %idx
@@ -100,12 +109,21 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-LABEL: extract_16xi16_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0
+; CHECK-NEXT: addi.d $sp, $sp, -64
+; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 64
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: xvreplve.h $xr0, $xr0, $a2
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: srai.w $a0, $a0, 16
+; CHECK-NEXT: xvst $xr0, $sp, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 1
+; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: st.h $a0, $a1, 0
+; CHECK-NEXT: addi.d $sp, $fp, -64
+; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
%v = load volatile <16 x i16>, ptr %src
%e = extractelement <16 x i16> %v, i32 %idx
@@ -116,11 +134,21 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-LABEL: extract_8xi32_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0
+; CHECK-NEXT: addi.d $sp, $sp, -64
+; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 64
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a2
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: xvst $xr0, $sp, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
+; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: st.w $a0, $a1, 0
+; CHECK-NEXT: addi.d $sp, $fp, -64
+; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
%v = load volatile <8 x i32>, ptr %src
%e = extractelement <8 x i32> %v, i32 %idx
@@ -131,11 +159,21 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-LABEL: extract_4xi64_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0
+; CHECK-NEXT: addi.d $sp, $sp, -64
+; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 64
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a2
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: xvst $xr0, $sp, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
+; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: st.d $a0, $a1, 0
+; CHECK-NEXT: addi.d $sp, $fp, -64
+; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
%v = load volatile <4 x i64>, ptr %src
%e = extractelement <4 x i64> %v, i32 %idx
@@ -146,10 +184,21 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-LABEL: extract_8xfloat_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0
+; CHECK-NEXT: addi.d $sp, $sp, -64
+; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 64
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a2
+; CHECK-NEXT: xvst $xr0, $sp, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
+; CHECK-NEXT: fld.s $fa0, $a0, 0
; CHECK-NEXT: fst.s $fa0, $a1, 0
+; CHECK-NEXT: addi.d $sp, $fp, -64
+; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
%v = load volatile <8 x float>, ptr %src
%e = extractelement <8 x float> %v, i32 %idx
@@ -160,10 +209,21 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-LABEL: extract_4xdouble_idx:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0
+; CHECK-NEXT: addi.d $sp, $sp, -64
+; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
+; CHECK-NEXT: addi.d $fp, $sp, 64
+; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
-; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a2
+; CHECK-NEXT: xvst $xr0, $sp, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
+; CHECK-NEXT: fld.d $fa0, $a0, 0
; CHECK-NEXT: fst.d $fa0, $a1, 0
+; CHECK-NEXT: addi.d $sp, $fp, -64
+; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
%v = load volatile <4 x double>, ptr %src
%e = extractelement <4 x double> %v, i32 %idx
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
index 78b99701832a..a9a54257917a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll
@@ -6,11 +6,11 @@ define void @select_v32i8_imm(ptr %res, ptr %a0) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvrepli.h $xr1, -256
-; CHECK-NEXT: xvbitseli.b $xr0, $xr1, 1
-; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: xvbitseli.b $xr1, $xr0, 1
+; CHECK-NEXT: xvst $xr1, $a0, 0
; CHECK-NEXT: ret
%v0 = load <32 x i8>, ptr %a0
- %sel = select <32 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <32 x i8> %v0, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %sel = select <32 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> %v0
store <32 x i8> %sel, ptr %res
ret void
}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
index 823bd9baba91..97a55532907b 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll
@@ -6,11 +6,11 @@ define void @select_v16i8_imm(ptr %res, ptr %a0) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a1, 0
; CHECK-NEXT: vrepli.h $vr1, -256
-; CHECK-NEXT: vbitseli.b $vr0, $vr1, 255
-; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: vbitseli.b $vr1, $vr0, 255
+; CHECK-NEXT: vst $vr1, $a0, 0
; CHECK-NEXT: ret
%v0 = load <16 x i8>, ptr %a0
- %sel = select <16 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <16 x i8> %v0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %sel = select <16 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %v0
store <16 x i8> %sel, ptr %res
ret void
}
diff --git a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
index 1d3371cce833..b649b2ba1614 100644
--- a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
@@ -1,9 +1,61 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=m68k-linux -verify-machineinstrs | FileCheck %s
+define zeroext i8 @smul_i8(i8 signext %a, i8 signext %b) nounwind ssp {
+; CHECK-LABEL: smul_i8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: move.b (11,%sp), %d0
+; CHECK-NEXT: and.l #255, %d0
+; CHECK-NEXT: move.b (7,%sp), %d1
+; CHECK-NEXT: and.l #255, %d1
+; CHECK-NEXT: muls %d0, %d1
+; CHECK-NEXT: move.l %d1, %d0
+; CHECK-NEXT: and.l #65535, %d0
+; CHECK-NEXT: and.l #255, %d0
+; CHECK-NEXT: rts
+entry:
+ %smul = tail call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b)
+ %cmp = extractvalue { i8, i1 } %smul, 1
+ %smul.result = extractvalue { i8, i1 } %smul, 0
+ %X = select i1 %cmp, i8 42, i8 %smul.result
+ ret i8 %X
+}
+
+define zeroext i8 @smul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp {
+; CHECK-LABEL: smul_i8_no_ovf:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: move.l #42, %d0
+; CHECK-NEXT: rts
+entry:
+ %smul = tail call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b)
+ %cmp = extractvalue { i8, i1 } %smul, 1
+ %smul.result = extractvalue { i8, i1 } %smul, 0
+ %X = select i1 %cmp, i8 %smul.result, i8 42
+ ret i8 %X
+}
+
+declare { i8, i1 } @llvm.smul.with.overflow.i8(i8, i8) nounwind readnone
+
+define zeroext i16 @smul_i16(i16 signext %a, i16 signext %b) nounwind ssp {
+; CHECK-LABEL: smul_i16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: move.w (6,%sp), %d0
+; CHECK-NEXT: move.w (10,%sp), %d1
+; CHECK-NEXT: muls %d1, %d0
+; CHECK-NEXT: and.l #65535, %d0
+; CHECK-NEXT: rts
+entry:
+ %smul = tail call { i16, i1 } @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
+ %cmp = extractvalue { i16, i1 } %smul, 1
+ %smul.result = extractvalue { i16, i1 } %smul, 0
+ %X = select i1 %cmp, i16 42, i16 %smul.result
+ ret i16 %X
+}
+
+declare { i16, i1 } @llvm.smul.with.overflow.i16(i16, i16) nounwind readnone
+
declare i32 @printf(i8*, ...) nounwind
declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32)
-declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63)
@ok = internal constant [4 x i8] c"%d\0A\00"
@no = internal constant [4 x i8] c"no\0A\00"
@@ -11,37 +63,23 @@ declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63)
define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind {
; CHECK-LABEL: test1:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: suba.l #28, %sp
-; CHECK-NEXT: movem.l %d2-%d3, (20,%sp) ; 12-byte Folded Spill
-; CHECK-NEXT: move.l %d1, (12,%sp)
-; CHECK-NEXT: move.l #31, %d2
-; CHECK-NEXT: asr.l %d2, %d1
-; CHECK-NEXT: move.l %d1, (8,%sp)
-; CHECK-NEXT: move.l %d0, (4,%sp)
-; CHECK-NEXT: asr.l %d2, %d0
-; CHECK-NEXT: move.l %d0, (%sp)
-; CHECK-NEXT: jsr __muldi3@PLT
-; CHECK-NEXT: move.l %d1, %d3
-; CHECK-NEXT: asr.l %d2, %d3
-; CHECK-NEXT: sub.l %d3, %d0
-; CHECK-NEXT: sne %d0
-; CHECK-NEXT: cmpi.b #0, %d0
-; CHECK-NEXT: beq .LBB0_1
+; CHECK-NEXT: suba.l #12, %sp
+; CHECK-NEXT: muls.l %d1, %d0
+; CHECK-NEXT: bvc .LBB3_1
; CHECK-NEXT: ; %bb.2: ; %overflow
; CHECK-NEXT: lea (no,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
; CHECK-NEXT: move.b #0, %d0
-; CHECK-NEXT: bra .LBB0_3
-; CHECK-NEXT: .LBB0_1: ; %normal
-; CHECK-NEXT: move.l %d1, (4,%sp)
+; CHECK-NEXT: adda.l #12, %sp
+; CHECK-NEXT: rts
+; CHECK-NEXT: .LBB3_1: ; %normal
+; CHECK-NEXT: move.l %d0, (4,%sp)
; CHECK-NEXT: lea (ok,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
; CHECK-NEXT: move.b #1, %d0
-; CHECK-NEXT: .LBB0_3: ; %overflow
-; CHECK-NEXT: movem.l (20,%sp), %d2-%d3 ; 12-byte Folded Reload
-; CHECK-NEXT: adda.l #28, %sp
+; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
entry:
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -61,37 +99,25 @@ overflow:
define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind {
; CHECK-LABEL: test2:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: suba.l #28, %sp
-; CHECK-NEXT: movem.l %d2-%d3, (20,%sp) ; 12-byte Folded Spill
-; CHECK-NEXT: move.l %d1, (12,%sp)
-; CHECK-NEXT: move.l #31, %d2
-; CHECK-NEXT: asr.l %d2, %d1
-; CHECK-NEXT: move.l %d1, (8,%sp)
-; CHECK-NEXT: move.l %d0, (4,%sp)
-; CHECK-NEXT: asr.l %d2, %d0
-; CHECK-NEXT: move.l %d0, (%sp)
-; CHECK-NEXT: jsr __muldi3@PLT
-; CHECK-NEXT: move.l %d1, %d3
-; CHECK-NEXT: asr.l %d2, %d3
-; CHECK-NEXT: sub.l %d3, %d0
-; CHECK-NEXT: sne %d0
-; CHECK-NEXT: sub.b #1, %d0
-; CHECK-NEXT: bne .LBB1_3
+; CHECK-NEXT: suba.l #12, %sp
+; CHECK-NEXT: muls.l %d1, %d0
+; CHECK-NEXT: svs %d1
+; CHECK-NEXT: sub.b #1, %d1
+; CHECK-NEXT: bne .LBB4_2
; CHECK-NEXT: ; %bb.1: ; %overflow
; CHECK-NEXT: lea (no,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
; CHECK-NEXT: move.b #0, %d0
-; CHECK-NEXT: bra .LBB1_2
-; CHECK-NEXT: .LBB1_3: ; %normal
-; CHECK-NEXT: move.l %d1, (4,%sp)
+; CHECK-NEXT: adda.l #12, %sp
+; CHECK-NEXT: rts
+; CHECK-NEXT: .LBB4_2: ; %normal
+; CHECK-NEXT: move.l %d0, (4,%sp)
; CHECK-NEXT: lea (ok,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
; CHECK-NEXT: move.b #1, %d0
-; CHECK-NEXT: .LBB1_2: ; %overflow
-; CHECK-NEXT: movem.l (20,%sp), %d2-%d3 ; 12-byte Folded Reload
-; CHECK-NEXT: adda.l #28, %sp
+; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
entry:
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -129,7 +155,8 @@ define i32 @test4(i32 %a, i32 %b) nounwind readnone {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: move.l (8,%sp), %d0
; CHECK-NEXT: add.l (4,%sp), %d0
-; CHECK-NEXT: lsl.l #2, %d0
+; CHECK-NEXT: move.l #4, %d1
+; CHECK-NEXT: muls.l %d1, %d0
; CHECK-NEXT: rts
entry:
%tmp0 = add i32 %b, %a
diff --git a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
index 16dc1036fd28..fd128a3e52bd 100644
--- a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
@@ -1,20 +1,68 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=m68k -verify-machineinstrs | FileCheck %s
+define zeroext i8 @umul_i8(i8 signext %a, i8 signext %b) nounwind ssp {
+; CHECK-LABEL: umul_i8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: move.b (11,%sp), %d0
+; CHECK-NEXT: and.l #255, %d0
+; CHECK-NEXT: move.b (7,%sp), %d1
+; CHECK-NEXT: and.l #255, %d1
+; CHECK-NEXT: muls %d0, %d1
+; CHECK-NEXT: move.l %d1, %d0
+; CHECK-NEXT: and.l #65535, %d0
+; CHECK-NEXT: and.l #255, %d0
+; CHECK-NEXT: rts
+entry:
+ %umul = tail call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
+ %cmp = extractvalue { i8, i1 } %umul, 1
+ %umul.result = extractvalue { i8, i1 } %umul, 0
+ %X = select i1 %cmp, i8 42, i8 %umul.result
+ ret i8 %X
+}
+
+define zeroext i8 @umul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp {
+; CHECK-LABEL: umul_i8_no_ovf:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: move.l #42, %d0
+; CHECK-NEXT: rts
+entry:
+ %umul = tail call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
+ %cmp = extractvalue { i8, i1 } %umul, 1
+ %umul.result = extractvalue { i8, i1 } %umul, 0
+ %X = select i1 %cmp, i8 %umul.result, i8 42
+ ret i8 %X
+}
+
+declare { i8, i1 } @llvm.umul.with.overflow.i8(i8, i8) nounwind readnone
+
+define zeroext i16 @umul_i16(i16 signext %a, i16 signext %b) nounwind ssp {
+; CHECK-LABEL: umul_i16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: move.w (6,%sp), %d0
+; CHECK-NEXT: move.w (10,%sp), %d1
+; CHECK-NEXT: muls %d1, %d0
+; CHECK-NEXT: and.l #65535, %d0
+; CHECK-NEXT: rts
+entry:
+ %umul = tail call { i16, i1 } @llvm.umul.with.overflow.i16(i16 %a, i16 %b)
+ %cmp = extractvalue { i16, i1 } %umul, 1
+ %umul.result = extractvalue { i16, i1 } %umul, 0
+ %X = select i1 %cmp, i16 42, i16 %umul.result
+ ret i16 %X
+}
+
+declare { i16, i1 } @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone
+
declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
define i1 @a(i32 %x) nounwind {
; CHECK-LABEL: a:
; CHECK: ; %bb.0:
-; CHECK-NEXT: suba.l #20, %sp
-; CHECK-NEXT: move.l #3, (12,%sp)
-; CHECK-NEXT: move.l #0, (8,%sp)
-; CHECK-NEXT: move.l (24,%sp), (4,%sp)
-; CHECK-NEXT: move.l #0, (%sp)
-; CHECK-NEXT: jsr __muldi3@PLT
-; CHECK-NEXT: cmpi.l #0, %d0
-; CHECK-NEXT: sne %d0
-; CHECK-NEXT: adda.l #20, %sp
+; CHECK-NEXT: move.l #3, %d0
+; CHECK-NEXT: move.l (4,%sp), %d1
+; CHECK-NEXT: mulu.l %d0, %d1
+; CHECK-NEXT: svs %d0
; CHECK-NEXT: rts
%res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3)
%obil = extractvalue {i32, i1} %res, 1
@@ -42,7 +90,8 @@ define i32 @test3(i32 %a, i32 %b) nounwind readnone {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: move.l (8,%sp), %d0
; CHECK-NEXT: add.l (4,%sp), %d0
-; CHECK-NEXT: lsl.l #2, %d0
+; CHECK-NEXT: move.l #4, %d1
+; CHECK-NEXT: mulu.l %d1, %d0
; CHECK-NEXT: rts
entry:
%tmp0 = add i32 %b, %a
diff --git a/llvm/test/CodeGen/M68k/global-address.ll b/llvm/test/CodeGen/M68k/global-address.ll
new file mode 100644
index 000000000000..8af37f9f733f
--- /dev/null
+++ b/llvm/test/CodeGen/M68k/global-address.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=m68k < %s | FileCheck %s
+
+@VBRTag = external dso_local global [2147483647 x i8]
+
+define i1 @folded_offset(i32 %conv29) {
+; CHECK-LABEL: folded_offset:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: ; %bb.0: ; %entry
+; CHECK-NEXT: move.b (VBRTag+1,%pc), %d0
+; CHECK-NEXT: ext.w %d0
+; CHECK-NEXT: ext.l %d0
+; CHECK-NEXT: sub.l (4,%sp), %d0
+; CHECK-NEXT: seq %d0
+; CHECK-NEXT: rts
+entry:
+ %0 = load i8, ptr getelementptr inbounds ([2147483647 x i8], ptr @VBRTag, i32 0, i32 1), align 1
+ %conv30 = sext i8 %0 to i32
+ %cmp31.not = icmp eq i32 %conv30, %conv29
+ ret i1 %cmp31.not
+}
+
+define i1 @non_folded_offset(i32 %conv29) {
+; CHECK-LABEL: non_folded_offset:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: ; %bb.0: ; %entry
+; CHECK-NEXT: move.l #2147483645, %d0
+; CHECK-NEXT: lea (VBRTag,%pc), %a0
+; CHECK-NEXT: move.b (0,%a0,%d0), %d0
+; CHECK-NEXT: ext.w %d0
+; CHECK-NEXT: ext.l %d0
+; CHECK-NEXT: sub.l (4,%sp), %d0
+; CHECK-NEXT: seq %d0
+; CHECK-NEXT: rts
+entry:
+ %0 = load i8, ptr getelementptr inbounds ([2147483647 x i8], ptr @VBRTag, i32 0, i32 2147483645), align 1
+ %conv30 = sext i8 %0 to i32
+ %cmp31.not = icmp eq i32 %conv30, %conv29
+ ret i1 %cmp31.not
+}
diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
index 1104bdfb004a..c989d2bca651 100644
--- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
+++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
@@ -26,7 +26,7 @@
; Also, the first eviction problem is significantly less than 300 instructions. Check
; that there is a zero value.
; Note: we're regex-ing some of the opcodes to avoid test flakiness.
-; CHECK: instructions: 19,{{([0-9]{4})}},13{{([0-9]{2})}},13{{([0-9]{2})}},{{.*}},0,
+; CHECK: instructions: 19,{{([0-9]{4})}},16{{([0-9]{2})}},16{{([0-9]{2})}},{{.*}},0,
; Only the candidate virtreg and the 10th LR are included in this problem. Make
; sure the other LRs have values of zero. There are 2700 0s followed by some 1s.
; There's a limit to how many repetitions can be matched.
diff --git a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll
index e17980e98e9b..ee187678949e 100644
--- a/llvm/test/CodeGen/Mips/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/Mips/funnel-shift-rot.ll
@@ -62,10 +62,8 @@ define i16 @rotl_i16(i16 %x, i16 %z) {
define i32 @rotl_i32(i32 %x, i32 %z) {
; CHECK-LABEL: rotl_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: andi $1, $5, 31
-; CHECK-NEXT: sllv $1, $4, $1
+; CHECK-NEXT: sllv $1, $4, $5
; CHECK-NEXT: negu $2, $5
-; CHECK-NEXT: andi $2, $2, 31
; CHECK-NEXT: srlv $2, $4, $2
; CHECK-NEXT: jr $ra
; CHECK-NEXT: or $2, $1, $2
@@ -80,15 +78,13 @@ define i64 @rotl_i64(i64 %x, i64 %z) {
; CHECK-BE-NEXT: andi $1, $1, 1
; CHECK-BE-NEXT: move $3, $4
; CHECK-BE-NEXT: movn $3, $5, $1
-; CHECK-BE-NEXT: andi $6, $7, 31
-; CHECK-BE-NEXT: sllv $2, $3, $6
+; CHECK-BE-NEXT: sllv $2, $3, $7
; CHECK-BE-NEXT: movn $5, $4, $1
; CHECK-BE-NEXT: srl $1, $5, 1
; CHECK-BE-NEXT: not $4, $7
-; CHECK-BE-NEXT: andi $4, $4, 31
; CHECK-BE-NEXT: srlv $1, $1, $4
; CHECK-BE-NEXT: or $2, $2, $1
-; CHECK-BE-NEXT: sllv $1, $5, $6
+; CHECK-BE-NEXT: sllv $1, $5, $7
; CHECK-BE-NEXT: srl $3, $3, 1
; CHECK-BE-NEXT: srlv $3, $3, $4
; CHECK-BE-NEXT: jr $ra
@@ -100,15 +96,13 @@ define i64 @rotl_i64(i64 %x, i64 %z) {
; CHECK-LE-NEXT: andi $1, $1, 1
; CHECK-LE-NEXT: move $3, $4
; CHECK-LE-NEXT: movn $3, $5, $1
-; CHECK-LE-NEXT: andi $7, $6, 31
-; CHECK-LE-NEXT: sllv $2, $3, $7
+; CHECK-LE-NEXT: sllv $2, $3, $6
; CHECK-LE-NEXT: movn $5, $4, $1
; CHECK-LE-NEXT: srl $1, $5, 1
; CHECK-LE-NEXT: not $4, $6
-; CHECK-LE-NEXT: andi $4, $4, 31
; CHECK-LE-NEXT: srlv $1, $1, $4
; CHECK-LE-NEXT: or $2, $2, $1
-; CHECK-LE-NEXT: sllv $1, $5, $7
+; CHECK-LE-NEXT: sllv $1, $5, $6
; CHECK-LE-NEXT: srl $3, $3, 1
; CHECK-LE-NEXT: srlv $3, $3, $4
; CHECK-LE-NEXT: jr $ra
@@ -122,35 +116,27 @@ define i64 @rotl_i64(i64 %x, i64 %z) {
define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) {
; CHECK-LABEL: rotl_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: lw $1, 24($sp)
+; CHECK-NEXT: lw $1, 20($sp)
; CHECK-NEXT: negu $2, $1
-; CHECK-NEXT: lw $3, 20($sp)
+; CHECK-NEXT: lw $3, 24($sp)
; CHECK-NEXT: negu $8, $3
-; CHECK-NEXT: andi $8, $8, 31
-; CHECK-NEXT: andi $2, $2, 31
-; CHECK-NEXT: andi $3, $3, 31
-; CHECK-NEXT: andi $1, $1, 31
-; CHECK-NEXT: lw $9, 16($sp)
-; CHECK-NEXT: sllv $1, $6, $1
-; CHECK-NEXT: srlv $6, $6, $2
-; CHECK-NEXT: sllv $3, $5, $3
-; CHECK-NEXT: srlv $5, $5, $8
-; CHECK-NEXT: andi $2, $9, 31
-; CHECK-NEXT: sllv $2, $4, $2
-; CHECK-NEXT: negu $8, $9
-; CHECK-NEXT: andi $8, $8, 31
-; CHECK-NEXT: srlv $4, $4, $8
-; CHECK-NEXT: lw $8, 28($sp)
-; CHECK-NEXT: or $2, $2, $4
-; CHECK-NEXT: or $3, $3, $5
-; CHECK-NEXT: or $4, $1, $6
-; CHECK-NEXT: andi $1, $8, 31
-; CHECK-NEXT: sllv $1, $7, $1
-; CHECK-NEXT: negu $5, $8
-; CHECK-NEXT: andi $5, $5, 31
-; CHECK-NEXT: srlv $5, $7, $5
+; CHECK-NEXT: sllv $9, $6, $3
+; CHECK-NEXT: srlv $6, $6, $8
+; CHECK-NEXT: sllv $1, $5, $1
+; CHECK-NEXT: srlv $3, $5, $2
+; CHECK-NEXT: lw $2, 16($sp)
+; CHECK-NEXT: sllv $5, $4, $2
+; CHECK-NEXT: negu $2, $2
+; CHECK-NEXT: srlv $2, $4, $2
+; CHECK-NEXT: or $2, $5, $2
+; CHECK-NEXT: or $3, $1, $3
+; CHECK-NEXT: or $4, $9, $6
+; CHECK-NEXT: lw $1, 28($sp)
+; CHECK-NEXT: sllv $5, $7, $1
+; CHECK-NEXT: negu $1, $1
+; CHECK-NEXT: srlv $1, $7, $1
; CHECK-NEXT: jr $ra
-; CHECK-NEXT: or $5, $1, $5
+; CHECK-NEXT: or $5, $5, $1
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
ret <4 x i32> %f
}
@@ -224,10 +210,8 @@ define i16 @rotr_i16(i16 %x, i16 %z) {
define i32 @rotr_i32(i32 %x, i32 %z) {
; CHECK-LABEL: rotr_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: andi $1, $5, 31
-; CHECK-NEXT: srlv $1, $4, $1
+; CHECK-NEXT: srlv $1, $4, $5
; CHECK-NEXT: negu $2, $5
-; CHECK-NEXT: andi $2, $2, 31
; CHECK-NEXT: sllv $2, $4, $2
; CHECK-NEXT: jr $ra
; CHECK-NEXT: or $2, $1, $2
@@ -241,15 +225,13 @@ define i64 @rotr_i64(i64 %x, i64 %z) {
; CHECK-BE-NEXT: andi $1, $7, 32
; CHECK-BE-NEXT: move $3, $5
; CHECK-BE-NEXT: movz $3, $4, $1
-; CHECK-BE-NEXT: andi $6, $7, 31
-; CHECK-BE-NEXT: srlv $2, $3, $6
+; CHECK-BE-NEXT: srlv $2, $3, $7
; CHECK-BE-NEXT: movz $4, $5, $1
; CHECK-BE-NEXT: sll $1, $4, 1
; CHECK-BE-NEXT: not $5, $7
-; CHECK-BE-NEXT: andi $5, $5, 31
; CHECK-BE-NEXT: sllv $1, $1, $5
; CHECK-BE-NEXT: or $2, $1, $2
-; CHECK-BE-NEXT: srlv $1, $4, $6
+; CHECK-BE-NEXT: srlv $1, $4, $7
; CHECK-BE-NEXT: sll $3, $3, 1
; CHECK-BE-NEXT: sllv $3, $3, $5
; CHECK-BE-NEXT: jr $ra
@@ -260,15 +242,13 @@ define i64 @rotr_i64(i64 %x, i64 %z) {
; CHECK-LE-NEXT: andi $1, $6, 32
; CHECK-LE-NEXT: move $3, $5
; CHECK-LE-NEXT: movz $3, $4, $1
-; CHECK-LE-NEXT: andi $7, $6, 31
-; CHECK-LE-NEXT: srlv $2, $3, $7
+; CHECK-LE-NEXT: srlv $2, $3, $6
; CHECK-LE-NEXT: movz $4, $5, $1
; CHECK-LE-NEXT: sll $1, $4, 1
; CHECK-LE-NEXT: not $5, $6
-; CHECK-LE-NEXT: andi $5, $5, 31
; CHECK-LE-NEXT: sllv $1, $1, $5
; CHECK-LE-NEXT: or $2, $1, $2
-; CHECK-LE-NEXT: srlv $1, $4, $7
+; CHECK-LE-NEXT: srlv $1, $4, $6
; CHECK-LE-NEXT: sll $3, $3, 1
; CHECK-LE-NEXT: sllv $3, $3, $5
; CHECK-LE-NEXT: jr $ra
@@ -282,35 +262,27 @@ define i64 @rotr_i64(i64 %x, i64 %z) {
define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
; CHECK-LABEL: rotr_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: lw $1, 24($sp)
+; CHECK-NEXT: lw $1, 20($sp)
; CHECK-NEXT: negu $2, $1
-; CHECK-NEXT: lw $3, 20($sp)
+; CHECK-NEXT: lw $3, 24($sp)
; CHECK-NEXT: negu $8, $3
-; CHECK-NEXT: andi $8, $8, 31
-; CHECK-NEXT: andi $2, $2, 31
-; CHECK-NEXT: andi $3, $3, 31
-; CHECK-NEXT: andi $1, $1, 31
-; CHECK-NEXT: lw $9, 16($sp)
-; CHECK-NEXT: srlv $1, $6, $1
-; CHECK-NEXT: sllv $6, $6, $2
-; CHECK-NEXT: srlv $3, $5, $3
-; CHECK-NEXT: sllv $5, $5, $8
-; CHECK-NEXT: andi $2, $9, 31
-; CHECK-NEXT: srlv $2, $4, $2
-; CHECK-NEXT: negu $8, $9
-; CHECK-NEXT: andi $8, $8, 31
-; CHECK-NEXT: sllv $4, $4, $8
-; CHECK-NEXT: lw $8, 28($sp)
-; CHECK-NEXT: or $2, $2, $4
-; CHECK-NEXT: or $3, $3, $5
-; CHECK-NEXT: or $4, $1, $6
-; CHECK-NEXT: andi $1, $8, 31
-; CHECK-NEXT: srlv $1, $7, $1
-; CHECK-NEXT: negu $5, $8
-; CHECK-NEXT: andi $5, $5, 31
-; CHECK-NEXT: sllv $5, $7, $5
+; CHECK-NEXT: srlv $9, $6, $3
+; CHECK-NEXT: sllv $6, $6, $8
+; CHECK-NEXT: srlv $1, $5, $1
+; CHECK-NEXT: sllv $3, $5, $2
+; CHECK-NEXT: lw $2, 16($sp)
+; CHECK-NEXT: srlv $5, $4, $2
+; CHECK-NEXT: negu $2, $2
+; CHECK-NEXT: sllv $2, $4, $2
+; CHECK-NEXT: or $2, $5, $2
+; CHECK-NEXT: or $3, $1, $3
+; CHECK-NEXT: or $4, $9, $6
+; CHECK-NEXT: lw $1, 28($sp)
+; CHECK-NEXT: srlv $5, $7, $1
+; CHECK-NEXT: negu $1, $1
+; CHECK-NEXT: sllv $1, $7, $1
; CHECK-NEXT: jr $ra
-; CHECK-NEXT: or $5, $1, $5
+; CHECK-NEXT: or $5, $5, $1
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
ret <4 x i32> %f
}
diff --git a/llvm/test/CodeGen/Mips/funnel-shift.ll b/llvm/test/CodeGen/Mips/funnel-shift.ll
index 737e95c8262a..bda2b477b52f 100644
--- a/llvm/test/CodeGen/Mips/funnel-shift.ll
+++ b/llvm/test/CodeGen/Mips/funnel-shift.ll
@@ -33,12 +33,10 @@ define i16 @fshl_i16(i16 %x, i16 %y, i16 %z) {
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshl_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: andi $1, $6, 31
-; CHECK-NEXT: sllv $1, $4, $1
-; CHECK-NEXT: srl $2, $5, 1
-; CHECK-NEXT: not $3, $6
-; CHECK-NEXT: andi $3, $3, 31
-; CHECK-NEXT: srlv $2, $2, $3
+; CHECK-NEXT: sllv $1, $4, $6
+; CHECK-NEXT: not $2, $6
+; CHECK-NEXT: srl $3, $5, 1
+; CHECK-NEXT: srlv $2, $3, $2
; CHECK-NEXT: jr $ra
; CHECK-NEXT: or $2, $1, $2
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -72,25 +70,23 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-BE-NEXT: addiu $6, $zero, 0
; CHECK-BE-NEXT: jal __umoddi3
; CHECK-BE-NEXT: addiu $7, $zero, 37
-; CHECK-BE-NEXT: not $1, $3
-; CHECK-BE-NEXT: srl $2, $3, 5
-; CHECK-BE-NEXT: andi $4, $2, 1
-; CHECK-BE-NEXT: movn $19, $18, $4
-; CHECK-BE-NEXT: andi $3, $3, 31
+; CHECK-BE-NEXT: srl $1, $3, 5
+; CHECK-BE-NEXT: andi $1, $1, 1
+; CHECK-BE-NEXT: movn $19, $18, $1
; CHECK-BE-NEXT: sllv $2, $19, $3
-; CHECK-BE-NEXT: andi $1, $1, 31
+; CHECK-BE-NEXT: not $4, $3
; CHECK-BE-NEXT: srl $5, $16, 5
; CHECK-BE-NEXT: sll $6, $17, 27
; CHECK-BE-NEXT: or $5, $6, $5
-; CHECK-BE-NEXT: movn $18, $5, $4
+; CHECK-BE-NEXT: movn $18, $5, $1
; CHECK-BE-NEXT: srl $6, $18, 1
-; CHECK-BE-NEXT: srlv $6, $6, $1
+; CHECK-BE-NEXT: srlv $6, $6, $4
; CHECK-BE-NEXT: or $2, $2, $6
; CHECK-BE-NEXT: sllv $3, $18, $3
; CHECK-BE-NEXT: sll $6, $16, 27
-; CHECK-BE-NEXT: movn $5, $6, $4
-; CHECK-BE-NEXT: srl $4, $5, 1
-; CHECK-BE-NEXT: srlv $1, $4, $1
+; CHECK-BE-NEXT: movn $5, $6, $1
+; CHECK-BE-NEXT: srl $1, $5, 1
+; CHECK-BE-NEXT: srlv $1, $1, $4
; CHECK-BE-NEXT: or $3, $3, $1
; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
@@ -125,26 +121,25 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LE-NEXT: jal __umoddi3
; CHECK-LE-NEXT: addiu $7, $zero, 0
; CHECK-LE-NEXT: srl $1, $2, 5
-; CHECK-LE-NEXT: andi $1, $1, 1
-; CHECK-LE-NEXT: srl $3, $17, 5
+; CHECK-LE-NEXT: andi $3, $1, 1
+; CHECK-LE-NEXT: srl $1, $17, 5
; CHECK-LE-NEXT: sll $4, $16, 27
-; CHECK-LE-NEXT: or $3, $4, $3
+; CHECK-LE-NEXT: or $1, $4, $1
; CHECK-LE-NEXT: move $4, $19
-; CHECK-LE-NEXT: movn $4, $3, $1
-; CHECK-LE-NEXT: andi $5, $2, 31
-; CHECK-LE-NEXT: sllv $6, $4, $5
-; CHECK-LE-NEXT: not $2, $2
-; CHECK-LE-NEXT: andi $7, $2, 31
-; CHECK-LE-NEXT: sll $2, $17, 27
-; CHECK-LE-NEXT: movn $3, $2, $1
-; CHECK-LE-NEXT: srl $2, $3, 1
-; CHECK-LE-NEXT: srlv $2, $2, $7
-; CHECK-LE-NEXT: or $2, $6, $2
-; CHECK-LE-NEXT: movn $18, $19, $1
-; CHECK-LE-NEXT: sllv $1, $18, $5
+; CHECK-LE-NEXT: movn $4, $1, $3
+; CHECK-LE-NEXT: sllv $5, $4, $2
+; CHECK-LE-NEXT: not $6, $2
+; CHECK-LE-NEXT: sll $7, $17, 27
+; CHECK-LE-NEXT: movn $1, $7, $3
+; CHECK-LE-NEXT: srl $1, $1, 1
+; CHECK-LE-NEXT: srlv $1, $1, $6
+; CHECK-LE-NEXT: or $1, $5, $1
+; CHECK-LE-NEXT: movn $18, $19, $3
+; CHECK-LE-NEXT: sllv $2, $18, $2
; CHECK-LE-NEXT: srl $3, $4, 1
-; CHECK-LE-NEXT: srlv $3, $3, $7
-; CHECK-LE-NEXT: or $3, $1, $3
+; CHECK-LE-NEXT: srlv $3, $3, $6
+; CHECK-LE-NEXT: or $3, $2, $3
+; CHECK-LE-NEXT: move $2, $1
; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
@@ -278,12 +273,10 @@ define i16 @fshr_i16(i16 %x, i16 %y, i16 %z) {
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshr_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: andi $1, $6, 31
-; CHECK-NEXT: srlv $1, $5, $1
-; CHECK-NEXT: sll $2, $4, 1
-; CHECK-NEXT: not $3, $6
-; CHECK-NEXT: andi $3, $3, 31
-; CHECK-NEXT: sllv $2, $2, $3
+; CHECK-NEXT: srlv $1, $5, $6
+; CHECK-NEXT: not $2, $6
+; CHECK-NEXT: sll $3, $4, 1
+; CHECK-NEXT: sllv $2, $3, $2
; CHECK-NEXT: jr $ra
; CHECK-NEXT: or $2, $2, $1
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -324,19 +317,17 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-BE-NEXT: or $4, $4, $2
; CHECK-BE-NEXT: movz $19, $18, $3
; CHECK-BE-NEXT: movz $18, $4, $3
-; CHECK-BE-NEXT: andi $5, $1, 31
-; CHECK-BE-NEXT: srlv $2, $18, $5
-; CHECK-BE-NEXT: not $1, $1
-; CHECK-BE-NEXT: andi $1, $1, 31
+; CHECK-BE-NEXT: srlv $2, $18, $1
+; CHECK-BE-NEXT: not $5, $1
; CHECK-BE-NEXT: sll $6, $19, 1
-; CHECK-BE-NEXT: sllv $6, $6, $1
+; CHECK-BE-NEXT: sllv $6, $6, $5
; CHECK-BE-NEXT: sll $7, $16, 27
; CHECK-BE-NEXT: or $2, $6, $2
; CHECK-BE-NEXT: movz $4, $7, $3
-; CHECK-BE-NEXT: srlv $3, $4, $5
-; CHECK-BE-NEXT: sll $4, $18, 1
-; CHECK-BE-NEXT: sllv $1, $4, $1
-; CHECK-BE-NEXT: or $3, $1, $3
+; CHECK-BE-NEXT: srlv $1, $4, $1
+; CHECK-BE-NEXT: sll $3, $18, 1
+; CHECK-BE-NEXT: sllv $3, $3, $5
+; CHECK-BE-NEXT: or $3, $3, $1
; CHECK-BE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
; CHECK-BE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
; CHECK-BE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
@@ -378,18 +369,16 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LE-NEXT: move $5, $19
; CHECK-LE-NEXT: movz $5, $2, $3
; CHECK-LE-NEXT: movz $2, $4, $3
-; CHECK-LE-NEXT: andi $4, $1, 31
-; CHECK-LE-NEXT: srlv $2, $2, $4
-; CHECK-LE-NEXT: not $1, $1
-; CHECK-LE-NEXT: andi $1, $1, 31
+; CHECK-LE-NEXT: srlv $2, $2, $1
+; CHECK-LE-NEXT: not $4, $1
; CHECK-LE-NEXT: sll $6, $5, 1
-; CHECK-LE-NEXT: sllv $6, $6, $1
+; CHECK-LE-NEXT: sllv $6, $6, $4
; CHECK-LE-NEXT: or $2, $6, $2
-; CHECK-LE-NEXT: srlv $4, $5, $4
+; CHECK-LE-NEXT: srlv $1, $5, $1
; CHECK-LE-NEXT: movz $18, $19, $3
; CHECK-LE-NEXT: sll $3, $18, 1
-; CHECK-LE-NEXT: sllv $1, $3, $1
-; CHECK-LE-NEXT: or $3, $1, $4
+; CHECK-LE-NEXT: sllv $3, $3, $4
+; CHECK-LE-NEXT: or $3, $3, $1
; CHECK-LE-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
; CHECK-LE-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
; CHECK-LE-NEXT: lw $18, 28($sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 453ca0d6bab3..450fe968d491 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -89,51 +89,43 @@ entry:
define signext i8 @ashr_i8(i8 signext %a, i8 signext %b) {
; MIPS-LABEL: ashr_i8:
; MIPS: # %bb.0: # %entry
-; MIPS-NEXT: andi $1, $5, 255
; MIPS-NEXT: jr $ra
-; MIPS-NEXT: srav $2, $4, $1
+; MIPS-NEXT: srav $2, $4, $5
;
; MIPS32-LABEL: ashr_i8:
; MIPS32: # %bb.0: # %entry
-; MIPS32-NEXT: andi $1, $5, 255
; MIPS32-NEXT: jr $ra
-; MIPS32-NEXT: srav $2, $4, $1
+; MIPS32-NEXT: srav $2, $4, $5
;
; 32R2-LABEL: ashr_i8:
; 32R2: # %bb.0: # %entry
-; 32R2-NEXT: andi $1, $5, 255
; 32R2-NEXT: jr $ra
-; 32R2-NEXT: srav $2, $4, $1
+; 32R2-NEXT: srav $2, $4, $5
;
; 32R6-LABEL: ashr_i8:
; 32R6: # %bb.0: # %entry
-; 32R6-NEXT: andi $1, $5, 255
; 32R6-NEXT: jr $ra
-; 32R6-NEXT: srav $2, $4, $1
+; 32R6-NEXT: srav $2, $4, $5
;
; MIPS3-LABEL: ashr_i8:
; MIPS3: # %bb.0: # %entry
-; MIPS3-NEXT: andi $1, $5, 255
; MIPS3-NEXT: jr $ra
-; MIPS3-NEXT: srav $2, $4, $1
+; MIPS3-NEXT: srav $2, $4, $5
;
; MIPS64-LABEL: ashr_i8:
; MIPS64: # %bb.0: # %entry
-; MIPS64-NEXT: andi $1, $5, 255
; MIPS64-NEXT: jr $ra
-; MIPS64-NEXT: srav $2, $4, $1
+; MIPS64-NEXT: srav $2, $4, $5
;
; MIPS64R2-LABEL: ashr_i8:
; MIPS64R2: # %bb.0: # %entry
-; MIPS64R2-NEXT: andi $1, $5, 255
; MIPS64R2-NEXT: jr $ra
-; MIPS64R2-NEXT: srav $2, $4, $1
+; MIPS64R2-NEXT: srav $2, $4, $5
;
; MIPS64R6-LABEL: ashr_i8:
; MIPS64R6: # %bb.0: # %entry
-; MIPS64R6-NEXT: andi $1, $5, 255
; MIPS64R6-NEXT: jr $ra
-; MIPS64R6-NEXT: srav $2, $4, $1
+; MIPS64R6-NEXT: srav $2, $4, $5
;
; MMR3-LABEL: ashr_i8:
; MMR3: # %bb.0: # %entry
@@ -155,51 +147,43 @@ entry:
define signext i16 @ashr_i16(i16 signext %a, i16 signext %b) {
; MIPS-LABEL: ashr_i16:
; MIPS: # %bb.0: # %entry
-; MIPS-NEXT: andi $1, $5, 65535
; MIPS-NEXT: jr $ra
-; MIPS-NEXT: srav $2, $4, $1
+; MIPS-NEXT: srav $2, $4, $5
;
; MIPS32-LABEL: ashr_i16:
; MIPS32: # %bb.0: # %entry
-; MIPS32-NEXT: andi $1, $5, 65535
; MIPS32-NEXT: jr $ra
-; MIPS32-NEXT: srav $2, $4, $1
+; MIPS32-NEXT: srav $2, $4, $5
;
; 32R2-LABEL: ashr_i16:
; 32R2: # %bb.0: # %entry
-; 32R2-NEXT: andi $1, $5, 65535
; 32R2-NEXT: jr $ra
-; 32R2-NEXT: srav $2, $4, $1
+; 32R2-NEXT: srav $2, $4, $5
;
; 32R6-LABEL: ashr_i16:
; 32R6: # %bb.0: # %entry
-; 32R6-NEXT: andi $1, $5, 65535
; 32R6-NEXT: jr $ra
-; 32R6-NEXT: srav $2, $4, $1
+; 32R6-NEXT: srav $2, $4, $5
;
; MIPS3-LABEL: ashr_i16:
; MIPS3: # %bb.0: # %entry
-; MIPS3-NEXT: andi $1, $5, 65535
; MIPS3-NEXT: jr $ra
-; MIPS3-NEXT: srav $2, $4, $1
+; MIPS3-NEXT: srav $2, $4, $5
;
; MIPS64-LABEL: ashr_i16:
; MIPS64: # %bb.0: # %entry
-; MIPS64-NEXT: andi $1, $5, 65535
; MIPS64-NEXT: jr $ra
-; MIPS64-NEXT: srav $2, $4, $1
+; MIPS64-NEXT: srav $2, $4, $5
;
; MIPS64R2-LABEL: ashr_i16:
; MIPS64R2: # %bb.0: # %entry
-; MIPS64R2-NEXT: andi $1, $5, 65535
; MIPS64R2-NEXT: jr $ra
-; MIPS64R2-NEXT: srav $2, $4, $1
+; MIPS64R2-NEXT: srav $2, $4, $5
;
; MIPS64R6-LABEL: ashr_i16:
; MIPS64R6: # %bb.0: # %entry
-; MIPS64R6-NEXT: andi $1, $5, 65535
; MIPS64R6-NEXT: jr $ra
-; MIPS64R6-NEXT: srav $2, $4, $1
+; MIPS64R6-NEXT: srav $2, $4, $5
;
; MMR3-LABEL: ashr_i16:
; MMR3: # %bb.0: # %entry
@@ -428,7 +412,6 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS-NEXT: lwr $5, 11($1)
; MIPS-NEXT: andi $2, $2, 7
; MIPS-NEXT: not $6, $2
-; MIPS-NEXT: andi $6, $6, 31
; MIPS-NEXT: srlv $7, $5, $2
; MIPS-NEXT: sllv $4, $4, $6
; MIPS-NEXT: srlv $3, $3, $2
@@ -483,7 +466,6 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS32-NEXT: lwr $5, 11($1)
; MIPS32-NEXT: andi $2, $2, 7
; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: andi $6, $6, 31
; MIPS32-NEXT: srlv $7, $5, $2
; MIPS32-NEXT: sllv $4, $4, $6
; MIPS32-NEXT: srlv $3, $3, $2
@@ -537,7 +519,6 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R2-NEXT: lwr $5, 11($1)
; 32R2-NEXT: andi $2, $2, 7
; 32R2-NEXT: not $6, $2
-; 32R2-NEXT: andi $6, $6, 31
; 32R2-NEXT: srlv $7, $5, $2
; 32R2-NEXT: sllv $4, $4, $6
; 32R2-NEXT: srlv $3, $3, $2
@@ -581,7 +562,6 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R6-NEXT: lw $5, 8($1)
; 32R6-NEXT: andi $2, $2, 7
; 32R6-NEXT: not $6, $2
-; 32R6-NEXT: andi $6, $6, 31
; 32R6-NEXT: srlv $7, $5, $2
; 32R6-NEXT: sllv $4, $4, $6
; 32R6-NEXT: srlv $3, $3, $2
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index ddbb1f217837..03cf104e3120 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -427,7 +427,6 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS2-NEXT: lwr $5, 11($1)
; MIPS2-NEXT: andi $2, $2, 7
; MIPS2-NEXT: not $6, $2
-; MIPS2-NEXT: andi $6, $6, 31
; MIPS2-NEXT: srlv $7, $5, $2
; MIPS2-NEXT: sllv $4, $4, $6
; MIPS2-NEXT: srlv $3, $3, $2
@@ -481,7 +480,6 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32-NEXT: lwr $5, 11($1)
; MIPS32-NEXT: andi $2, $2, 7
; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: andi $6, $6, 31
; MIPS32-NEXT: srlv $7, $5, $2
; MIPS32-NEXT: sllv $4, $4, $6
; MIPS32-NEXT: srlv $3, $3, $2
@@ -534,7 +532,6 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2-NEXT: lwr $5, 11($1)
; MIPS32R2-NEXT: andi $2, $2, 7
; MIPS32R2-NEXT: not $6, $2
-; MIPS32R2-NEXT: andi $6, $6, 31
; MIPS32R2-NEXT: srlv $7, $5, $2
; MIPS32R2-NEXT: sllv $4, $4, $6
; MIPS32R2-NEXT: srlv $3, $3, $2
@@ -577,7 +574,6 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: lw $5, 8($1)
; MIPS32R6-NEXT: andi $2, $2, 7
; MIPS32R6-NEXT: not $6, $2
-; MIPS32R6-NEXT: andi $6, $6, 31
; MIPS32R6-NEXT: srlv $7, $5, $2
; MIPS32R6-NEXT: sllv $4, $4, $6
; MIPS32R6-NEXT: srlv $3, $3, $2
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 256da0b89e60..81f089a52947 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -94,69 +94,60 @@ entry:
define signext i8 @shl_i8(i8 signext %a, i8 signext %b) {
; MIPS2-LABEL: shl_i8:
; MIPS2: # %bb.0: # %entry
-; MIPS2-NEXT: andi $1, $5, 255
-; MIPS2-NEXT: sllv $1, $4, $1
+; MIPS2-NEXT: sllv $1, $4, $5
; MIPS2-NEXT: sll $1, $1, 24
; MIPS2-NEXT: jr $ra
; MIPS2-NEXT: sra $2, $1, 24
;
; MIPS32-LABEL: shl_i8:
; MIPS32: # %bb.0: # %entry
-; MIPS32-NEXT: andi $1, $5, 255
-; MIPS32-NEXT: sllv $1, $4, $1
+; MIPS32-NEXT: sllv $1, $4, $5
; MIPS32-NEXT: sll $1, $1, 24
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: sra $2, $1, 24
;
; MIPS32R2-LABEL: shl_i8:
; MIPS32R2: # %bb.0: # %entry
-; MIPS32R2-NEXT: andi $1, $5, 255
-; MIPS32R2-NEXT: sllv $1, $4, $1
+; MIPS32R2-NEXT: sllv $1, $4, $5
; MIPS32R2-NEXT: jr $ra
; MIPS32R2-NEXT: seb $2, $1
;
; MIPS32R6-LABEL: shl_i8:
; MIPS32R6: # %bb.0: # %entry
-; MIPS32R6-NEXT: andi $1, $5, 255
-; MIPS32R6-NEXT: sllv $1, $4, $1
+; MIPS32R6-NEXT: sllv $1, $4, $5
; MIPS32R6-NEXT: jr $ra
; MIPS32R6-NEXT: seb $2, $1
;
; MIPS3-LABEL: shl_i8:
; MIPS3: # %bb.0: # %entry
-; MIPS3-NEXT: andi $1, $5, 255
-; MIPS3-NEXT: sllv $1, $4, $1
+; MIPS3-NEXT: sllv $1, $4, $5
; MIPS3-NEXT: sll $1, $1, 24
; MIPS3-NEXT: jr $ra
; MIPS3-NEXT: sra $2, $1, 24
;
; MIPS4-LABEL: shl_i8:
; MIPS4: # %bb.0: # %entry
-; MIPS4-NEXT: andi $1, $5, 255
-; MIPS4-NEXT: sllv $1, $4, $1
+; MIPS4-NEXT: sllv $1, $4, $5
; MIPS4-NEXT: sll $1, $1, 24
; MIPS4-NEXT: jr $ra
; MIPS4-NEXT: sra $2, $1, 24
;
; MIPS64-LABEL: shl_i8:
; MIPS64: # %bb.0: # %entry
-; MIPS64-NEXT: andi $1, $5, 255
-; MIPS64-NEXT: sllv $1, $4, $1
+; MIPS64-NEXT: sllv $1, $4, $5
; MIPS64-NEXT: sll $1, $1, 24
; MIPS64-NEXT: jr $ra
; MIPS64-NEXT: sra $2, $1, 24
;
; MIPS64R2-LABEL: shl_i8:
; MIPS64R2: # %bb.0: # %entry
-; MIPS64R2-NEXT: andi $1, $5, 255
-; MIPS64R2-NEXT: sllv $1, $4, $1
+; MIPS64R2-NEXT: sllv $1, $4, $5
; MIPS64R2-NEXT: jr $ra
; MIPS64R2-NEXT: seb $2, $1
;
; MIPS64R6-LABEL: shl_i8:
; MIPS64R6: # %bb.0: # %entry
-; MIPS64R6-NEXT: andi $1, $5, 255
-; MIPS64R6-NEXT: sllv $1, $4, $1
+; MIPS64R6-NEXT: sllv $1, $4, $5
; MIPS64R6-NEXT: jr $ra
; MIPS64R6-NEXT: seb $2, $1
;
@@ -182,69 +173,60 @@ entry:
define signext i16 @shl_i16(i16 signext %a, i16 signext %b) {
; MIPS2-LABEL: shl_i16:
; MIPS2: # %bb.0: # %entry
-; MIPS2-NEXT: andi $1, $5, 65535
-; MIPS2-NEXT: sllv $1, $4, $1
+; MIPS2-NEXT: sllv $1, $4, $5
; MIPS2-NEXT: sll $1, $1, 16
; MIPS2-NEXT: jr $ra
; MIPS2-NEXT: sra $2, $1, 16
;
; MIPS32-LABEL: shl_i16:
; MIPS32: # %bb.0: # %entry
-; MIPS32-NEXT: andi $1, $5, 65535
-; MIPS32-NEXT: sllv $1, $4, $1
+; MIPS32-NEXT: sllv $1, $4, $5
; MIPS32-NEXT: sll $1, $1, 16
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: sra $2, $1, 16
;
; MIPS32R2-LABEL: shl_i16:
; MIPS32R2: # %bb.0: # %entry
-; MIPS32R2-NEXT: andi $1, $5, 65535
-; MIPS32R2-NEXT: sllv $1, $4, $1
+; MIPS32R2-NEXT: sllv $1, $4, $5
; MIPS32R2-NEXT: jr $ra
; MIPS32R2-NEXT: seh $2, $1
;
; MIPS32R6-LABEL: shl_i16:
; MIPS32R6: # %bb.0: # %entry
-; MIPS32R6-NEXT: andi $1, $5, 65535
-; MIPS32R6-NEXT: sllv $1, $4, $1
+; MIPS32R6-NEXT: sllv $1, $4, $5
; MIPS32R6-NEXT: jr $ra
; MIPS32R6-NEXT: seh $2, $1
;
; MIPS3-LABEL: shl_i16:
; MIPS3: # %bb.0: # %entry
-; MIPS3-NEXT: andi $1, $5, 65535
-; MIPS3-NEXT: sllv $1, $4, $1
+; MIPS3-NEXT: sllv $1, $4, $5
; MIPS3-NEXT: sll $1, $1, 16
; MIPS3-NEXT: jr $ra
; MIPS3-NEXT: sra $2, $1, 16
;
; MIPS4-LABEL: shl_i16:
; MIPS4: # %bb.0: # %entry
-; MIPS4-NEXT: andi $1, $5, 65535
-; MIPS4-NEXT: sllv $1, $4, $1
+; MIPS4-NEXT: sllv $1, $4, $5
; MIPS4-NEXT: sll $1, $1, 16
; MIPS4-NEXT: jr $ra
; MIPS4-NEXT: sra $2, $1, 16
;
; MIPS64-LABEL: shl_i16:
; MIPS64: # %bb.0: # %entry
-; MIPS64-NEXT: andi $1, $5, 65535
-; MIPS64-NEXT: sllv $1, $4, $1
+; MIPS64-NEXT: sllv $1, $4, $5
; MIPS64-NEXT: sll $1, $1, 16
; MIPS64-NEXT: jr $ra
; MIPS64-NEXT: sra $2, $1, 16
;
; MIPS64R2-LABEL: shl_i16:
; MIPS64R2: # %bb.0: # %entry
-; MIPS64R2-NEXT: andi $1, $5, 65535
-; MIPS64R2-NEXT: sllv $1, $4, $1
+; MIPS64R2-NEXT: sllv $1, $4, $5
; MIPS64R2-NEXT: jr $ra
; MIPS64R2-NEXT: seh $2, $1
;
; MIPS64R6-LABEL: shl_i16:
; MIPS64R6: # %bb.0: # %entry
-; MIPS64R6-NEXT: andi $1, $5, 65535
-; MIPS64R6-NEXT: sllv $1, $4, $1
+; MIPS64R6-NEXT: sllv $1, $4, $5
; MIPS64R6-NEXT: jr $ra
; MIPS64R6-NEXT: seh $2, $1
;
@@ -486,7 +468,6 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS2-NEXT: lwr $3, 7($4)
; MIPS2-NEXT: andi $1, $1, 7
; MIPS2-NEXT: not $6, $1
-; MIPS2-NEXT: andi $6, $6, 31
; MIPS2-NEXT: sllv $7, $3, $1
; MIPS2-NEXT: srlv $6, $2, $6
; MIPS2-NEXT: lwl $2, 0($4)
@@ -539,7 +520,6 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32-NEXT: lwr $3, 7($4)
; MIPS32-NEXT: andi $1, $1, 7
; MIPS32-NEXT: not $6, $1
-; MIPS32-NEXT: andi $6, $6, 31
; MIPS32-NEXT: sllv $7, $3, $1
; MIPS32-NEXT: srlv $6, $2, $6
; MIPS32-NEXT: lwl $2, 0($4)
@@ -591,7 +571,6 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2-NEXT: lwr $3, 7($4)
; MIPS32R2-NEXT: andi $1, $1, 7
; MIPS32R2-NEXT: not $6, $1
-; MIPS32R2-NEXT: andi $6, $6, 31
; MIPS32R2-NEXT: sllv $7, $3, $1
; MIPS32R2-NEXT: srlv $6, $2, $6
; MIPS32R2-NEXT: lwl $2, 0($4)
@@ -633,7 +612,6 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: lw $3, 4($4)
; MIPS32R6-NEXT: andi $1, $1, 7
; MIPS32R6-NEXT: not $6, $1
-; MIPS32R6-NEXT: andi $6, $6, 31
; MIPS32R6-NEXT: sllv $7, $3, $1
; MIPS32R6-NEXT: srlv $6, $2, $6
; MIPS32R6-NEXT: lw $2, 0($4)
diff --git a/llvm/test/CodeGen/Mips/optimizeAndPlusShift.ll b/llvm/test/CodeGen/Mips/optimizeAndPlusShift.ll
new file mode 100644
index 000000000000..bf69adf6702f
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/optimizeAndPlusShift.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=MIPS32
+; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=MIPS64
+
+define i32 @shl_32(i32 %a, i32 %b) {
+; MIPS32-LABEL: shl_32:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: sllv $2, $4, $5
+; MIPS64-LABEL: shl_32:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: sll $1, $5, 0
+; MIPS64-NEXT: sll $2, $4, 0
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: sllv $2, $2, $1
+ %_1 = and i32 %b, 31
+ %_0 = shl i32 %a, %_1
+ ret i32 %_0
+}
+
+define i32 @lshr_32(i32 %a, i32 %b) {
+; MIPS32-LABEL: lshr_32:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: srlv $2, $4, $5
+; MIPS64-LABEL: lshr_32:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: sll $1, $5, 0
+; MIPS64-NEXT: sll $2, $4, 0
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: srlv $2, $2, $1
+ %_1 = and i32 %b, 31
+ %_0 = lshr i32 %a, %_1
+ ret i32 %_0
+}
+
+define i32 @ashr_32(i32 %a, i32 %b) {
+; MIPS32-LABEL: ashr_32:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: srav $2, $4, $5
+; MIPS64-LABEL: ashr_32:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: sll $1, $5, 0
+; MIPS64-NEXT: sll $2, $4, 0
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: srav $2, $2, $1
+ %_1 = and i32 %b, 31
+ %_0 = ashr i32 %a, %_1
+ ret i32 %_0
+}
+
+define i64 @shl_64(i64 %a, i64 %b) {
+; MIPS64-LABEL: shl_64:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: sll $1, $5, 0
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: dsllv $2, $4, $1
+ %_1 = and i64 %b, 63
+ %_0 = shl i64 %a, %_1
+ ret i64 %_0
+}
+
+define i64 @lshr_64(i64 %a, i64 %b) {
+; MIPS64-LABEL: lshr_64:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: sll $1, $5, 0
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: dsrlv $2, $4, $1
+ %_1 = and i64 %b, 63
+ %_0 = lshr i64 %a, %_1
+ ret i64 %_0
+}
+
+define i64 @ashr_64(i64 %a, i64 %b) {
+; MIPS64-LABEL: ashr_64:
+; MIPS64: # %bb.0:
+; MIPS64-NEXT: sll $1, $5, 0
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: dsrav $2, $4, $1
+ %_1 = and i64 %b, 63
+ %_0 = ashr i64 %a, %_1
+ ret i64 %_0
+}
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
index 73831603df94..9199bf6ae12a 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -43,7 +43,7 @@ exit:
declare i32 @llvm.nvvm.reflect.p0i8(ptr)
-; CHECK-LABEL: define i32 @intrinsic
+; CHECK-LABEL: define noundef i32 @intrinsic
define i32 @intrinsic() {
; CHECK-NOT: call i32 @llvm.nvvm.reflect
; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
index c482ac087c41..9b1939f37208 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -43,7 +43,7 @@ exit:
declare i32 @llvm.nvvm.reflect.p0(ptr)
-; CHECK-LABEL: define i32 @intrinsic
+; CHECK-LABEL: define noundef i32 @intrinsic
define i32 @intrinsic() {
; CHECK-NOT: call i32 @llvm.nvvm.reflect
; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/PowerPC/aix-return55.ll b/llvm/test/CodeGen/PowerPC/aix-return55.ll
index c7d481ced140..a36deda9c146 100644
--- a/llvm/test/CodeGen/PowerPC/aix-return55.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-return55.ll
@@ -39,7 +39,7 @@ entry:
;CHECKOBJ-NEXT: 1c: 67 8a bc de oris 10, 28, 48350{{[[:space:]] *}}
;CHECKOBJ32-NEXT: 00000020 <d>:
;CHECKOBJ64-NEXT: 0000000000000020 <d>:
-;CHECKOBJ-NEXT: 20: 40 14 00 00 bdnzf 20, 0x20
+;CHECKOBJ-NEXT: 20: 40 14 00 00 bdnzf 20, 0x20 <d>
;CHECKOBJ-NEXT: 24: 00 00 00 00 <unknown>{{[[:space:]] *}}
;CHECKOBJ32-NEXT: 00000028 <foo>:
;CHECKOBJ32-NEXT: 28: 00 00 00 00 <unknown>
diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll
index a5056d407b76..a557b6f4f171 100644
--- a/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll
@@ -189,7 +189,7 @@ entry:
; DIS32-NEXT: 40: 7c 08 02 a6 mflr 0
; DIS32-NEXT: 44: 94 21 ff c0 stwu 1, -64(1)
; DIS32-NEXT: 48: 90 01 00 48 stw 0, 72(1)
-; DIS32-NEXT: 4c: 4b ff ff b5 bl 0x0 <.alias_foo>
+; DIS32-NEXT: 4c: 4b ff ff b5 bl 0x0 <.foo>
; DIS32-NEXT: 0000004c: R_RBR (idx: 7) .foo[PR]
; DIS32-NEXT: 50: 60 00 00 00 nop
; DIS32-NEXT: 54: 48 00 00 6d bl 0xc0 <.static_overalign_foo>
@@ -198,7 +198,7 @@ entry:
; DIS32-NEXT: 5c: 4b ff ff a5 bl 0x0 <.alias_foo>
; DIS32-NEXT: 0000005c: R_RBR (idx: 9) .alias_foo
; DIS32-NEXT: 60: 60 00 00 00 nop
-; DIS32-NEXT: 64: 4b ff ff 9d bl 0x0 <.alias_foo>
+; DIS32-NEXT: 64: 4b ff ff 9d bl 0x0 <.extern_foo>
; DIS32-NEXT: 00000064: R_RBR (idx: 1) .extern_foo[PR]
; DIS32-NEXT: 68: 60 00 00 00 nop
; DIS32-NEXT: 6c: 4b ff ff b5 bl 0x20 <.hidden_foo>
@@ -212,7 +212,7 @@ entry:
; DIS64-NEXT: 40: 7c 08 02 a6 mflr 0
; DIS64-NEXT: 44: f8 21 ff 91 stdu 1, -112(1)
; DIS64-NEXT: 48: f8 01 00 80 std 0, 128(1)
-; DIS64-NEXT: 4c: 4b ff ff b5 bl 0x0 <.alias_foo>
+; DIS64-NEXT: 4c: 4b ff ff b5 bl 0x0 <.foo>
; DIS64-NEXT: 000000000000004c: R_RBR (idx: 7) .foo[PR]
; DIS64-NEXT: 50: 60 00 00 00 nop
; DIS64-NEXT: 54: 48 00 00 6d bl 0xc0 <.static_overalign_foo>
@@ -221,7 +221,7 @@ entry:
; DIS64-NEXT: 5c: 4b ff ff a5 bl 0x0 <.alias_foo>
; DIS64-NEXT: 000000000000005c: R_RBR (idx: 9) .alias_foo
; DIS64-NEXT: 60: 60 00 00 00 nop
-; DIS64-NEXT: 64: 4b ff ff 9d bl 0x0 <.alias_foo>
+; DIS64-NEXT: 64: 4b ff ff 9d bl 0x0 <.extern_foo>
; DIS64-NEXT: 0000000000000064: R_RBR (idx: 1) .extern_foo[PR]
; DIS64-NEXT: 68: 60 00 00 00 nop
; DIS64-NEXT: 6c: 4b ff ff b5 bl 0x20 <.hidden_foo>
diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll
index 97a5fbcf78f5..5ac6a7af0db2 100644
--- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll
@@ -456,7 +456,7 @@ declare i32 @bar(i32)
; SYM-NEXT: ]
-; DIS: {{.*}}aix-xcoff-reloc.ll.tmp.o: file format aixcoff-rs6000
+; DIS: : file format aixcoff-rs6000
; DIS: Disassembly of section .text:
; DIS: 00000000 <.foo>:
; DIS-NEXT: 0: 7c 08 02 a6 mflr 0
@@ -495,7 +495,7 @@ declare i32 @bar(i32)
; DIS: 00000084 <globalB>:
; DIS-NEXT: 84: 00 00 00 44 <unknown>
-; DIS_REL: {{.*}}aix-xcoff-reloc.ll.tmp.o: file format aixcoff-rs6000
+; DIS_REL: : file format aixcoff-rs6000
; DIS_REL: RELOCATION RECORDS FOR [.text]:
; DIS_REL-NEXT: OFFSET TYPE VALUE
; DIS_REL-NEXT: 00000010 R_RBR .bar
@@ -515,7 +515,7 @@ declare i32 @bar(i32)
; DIS64-NEXT: 4: f8 21 ff 91 stdu 1, -112(1)
; DIS64-NEXT: 8: 38 60 00 01 li 3, 1
; DIS64-NEXT: c: f8 01 00 80 std 0, 128(1)
-; DIS64-NEXT: 10: 4b ff ff f1 bl 0x0 <.foo>
+; DIS64-NEXT: 10: 4b ff ff f1 bl 0x0 <.bar>
; DIS64-NEXT: 14: 60 00 00 00 nop
; DIS64-NEXT: 18: e8 82 00 00 ld 4, 0(2)
; DIS64-NEXT: 1c: e8 a2 00 08 ld 5, 8(2)
diff --git a/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir b/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir
new file mode 100644
index 000000000000..8e4e3be55600
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir
@@ -0,0 +1,28 @@
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-coalescing -run-pass=register-coalescer \
+# RUN: -o - %s | FileCheck %s
+---
+name: _Z13testTransposeIfLj31ELj17EEvv
+alignment: 16
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 128
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: _Z13testTransposeIfLj31ELj17EEvv
+ ; CHECK: undef %[[REG:[0-9]+]].sub_64:vsrc = IMPLICIT_DEF implicit-def %[[REG]]
+ bb.0:
+ liveins: $x2
+ %2:vssrc = IMPLICIT_DEF
+ B %bb.2
+
+ bb.1:
+ %0:vsrc = SUBREG_TO_REG 1, killed %2, %subreg.sub_64
+ %1:vsrc = XXPERMDI killed undef %0, killed %0, 0
+ BLR8 implicit $lr8, implicit $rm
+
+ bb.2:
+ successors: %bb.2(0x7c000000), %bb.1(0x04000000)
+ BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8
+ B %bb.1
+
+...
diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index 18c0f25ed10e..35e5d61947ea 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -419,6 +419,60 @@ entry:
}
declare fp128 @llvm.cos.f128(fp128 %Val)
+define fp128 @qp_sincos(ptr nocapture readonly %a) nounwind {
+; CHECK-LABEL: qp_sincos:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mflr r0
+; CHECK-NEXT: stdu r1, -64(r1)
+; CHECK-NEXT: std r0, 80(r1)
+; CHECK-NEXT: addi r5, r1, 48
+; CHECK-NEXT: addi r6, r1, 32
+; CHECK-NEXT: lxv v2, 0(r3)
+; CHECK-NEXT: bl sincosf128
+; CHECK-NEXT: nop
+; CHECK-NEXT: lxv v2, 48(r1)
+; CHECK-NEXT: lxv v3, 32(r1)
+; CHECK-NEXT: xsmulqp v2, v3, v2
+; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: ld r0, 16(r1)
+; CHECK-NEXT: mtlr r0
+; CHECK-NEXT: blr
+;
+; CHECK-P8-LABEL: qp_sincos:
+; CHECK-P8: # %bb.0: # %entry
+; CHECK-P8-NEXT: mflr r0
+; CHECK-P8-NEXT: std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT: stdu r1, -96(r1)
+; CHECK-P8-NEXT: std r0, 112(r1)
+; CHECK-P8-NEXT: addi r30, r1, 48
+; CHECK-P8-NEXT: addi r29, r1, 32
+; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT: mr r5, r30
+; CHECK-P8-NEXT: mr r6, r29
+; CHECK-P8-NEXT: xxswapd v2, vs0
+; CHECK-P8-NEXT: bl sincosf128
+; CHECK-P8-NEXT: nop
+; CHECK-P8-NEXT: lxvd2x vs0, 0, r29
+; CHECK-P8-NEXT: xxswapd v2, vs0
+; CHECK-P8-NEXT: lxvd2x vs0, 0, r30
+; CHECK-P8-NEXT: xxswapd v3, vs0
+; CHECK-P8-NEXT: bl __mulkf3
+; CHECK-P8-NEXT: nop
+; CHECK-P8-NEXT: addi r1, r1, 96
+; CHECK-P8-NEXT: ld r0, 16(r1)
+; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT: mtlr r0
+; CHECK-P8-NEXT: blr
+entry:
+ %0 = load fp128, ptr %a, align 16
+ %1 = tail call fp128 @llvm.cos.f128(fp128 %0)
+ %2 = tail call fp128 @llvm.sin.f128(fp128 %0)
+ %3 = fmul fp128 %1, %2
+ ret fp128 %3
+}
+
define fp128 @qp_log(ptr nocapture readonly %a) {
; CHECK-LABEL: qp_log:
; CHECK: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv32.mir
index f1f570f08ae4..5b0e52dd4f67 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv32.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=instruction-select \
# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv32 -mattr=+zbkb -run-pass=instruction-select \
+# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s
---
name: rotl_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir
index 2210b8887041..6731f54e055d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=riscv64 -mattr=+zbb -run-pass=instruction-select \
# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv64 -mattr=+zbkb -run-pass=instruction-select \
+# RUN: -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s
---
name: rotl_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv32.mir
index 4395481328b6..cb7ffdf10c19 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv32.mir
@@ -2,7 +2,9 @@
# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
# RUN: | FileCheck %s --check-prefixes=CHECK,RV32I
# RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=legalizer %s -o - \
-# RUN: | FileCheck %s --check-prefixes=CHECK,RV32ZBB
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV32ZBB_OR_RV32ZBKB
+# RUN: llc -mtriple=riscv32 -mattr=+zbkb -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV32ZBB_OR_RV32ZBKB
---
name: rotl_i8
@@ -92,14 +94,14 @@ body: |
; RV32I-NEXT: $x10 = COPY [[OR]](s32)
; RV32I-NEXT: PseudoRET implicit $x10
;
- ; RV32ZBB-LABEL: name: rotl_i32
- ; RV32ZBB: liveins: $x10, $x11
- ; RV32ZBB-NEXT: {{ $}}
- ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
- ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
- ; RV32ZBB-NEXT: [[ROTL:%[0-9]+]]:_(s32) = G_ROTL [[COPY]], [[COPY1]](s32)
- ; RV32ZBB-NEXT: $x10 = COPY [[ROTL]](s32)
- ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ ; RV32ZBB_OR_RV32ZBKB-LABEL: name: rotl_i32
+ ; RV32ZBB_OR_RV32ZBKB: liveins: $x10, $x11
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: {{ $}}
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: [[ROTL:%[0-9]+]]:_(s32) = G_ROTL [[COPY]], [[COPY1]](s32)
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: $x10 = COPY [[ROTL]](s32)
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: PseudoRET implicit $x10
%0:_(s32) = COPY $x10
%1:_(s32) = COPY $x11
%2:_(s32) = G_ROTL %0, %1(s32)
@@ -260,14 +262,14 @@ body: |
; RV32I-NEXT: $x10 = COPY [[OR]](s32)
; RV32I-NEXT: PseudoRET implicit $x10
;
- ; RV32ZBB-LABEL: name: rotr_i32
- ; RV32ZBB: liveins: $x10, $x11
- ; RV32ZBB-NEXT: {{ $}}
- ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
- ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
- ; RV32ZBB-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[COPY1]](s32)
- ; RV32ZBB-NEXT: $x10 = COPY [[ROTR]](s32)
- ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ ; RV32ZBB_OR_RV32ZBKB-LABEL: name: rotr_i32
+ ; RV32ZBB_OR_RV32ZBKB: liveins: $x10, $x11
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: {{ $}}
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[COPY1]](s32)
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: $x10 = COPY [[ROTR]](s32)
+ ; RV32ZBB_OR_RV32ZBKB-NEXT: PseudoRET implicit $x10
%0:_(s32) = COPY $x10
%1:_(s32) = COPY $x11
%2:_(s32) = G_ROTR %0, %1(s32)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
index 91e6eeaee576..b9d7b838c3b9 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
@@ -2,7 +2,9 @@
# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \
# RUN: | FileCheck %s --check-prefixes=CHECK,RV64I
# RUN: llc -mtriple=riscv64 -mattr=+zbb -run-pass=legalizer %s -o - \
-# RUN: | FileCheck %s --check-prefixes=CHECK,RV64ZBB
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV64ZBB_OR_RV64ZBKB
+# RUN: llc -mtriple=riscv64 -mattr=+zbkb -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV64ZBB_OR_RV64ZBKB
---
name: rotl_i8
@@ -105,18 +107,18 @@ body: |
; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
; RV64I-NEXT: PseudoRET implicit $x10
;
- ; RV64ZBB-LABEL: name: rotl_i32
- ; RV64ZBB: liveins: $x10, $x11
- ; RV64ZBB-NEXT: {{ $}}
- ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
- ; RV64ZBB-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
- ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
- ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
- ; RV64ZBB-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
- ; RV64ZBB-NEXT: [[ROTL:%[0-9]+]]:_(s32) = G_ROTL [[TRUNC]], [[AND]](s64)
- ; RV64ZBB-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ROTL]](s32)
- ; RV64ZBB-NEXT: $x10 = COPY [[ANYEXT]](s64)
- ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ ; RV64ZBB_OR_RV64ZBKB-LABEL: name: rotl_i32
+ ; RV64ZBB_OR_RV64ZBKB: liveins: $x10, $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: {{ $}}
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ROTL:%[0-9]+]]:_(s32) = G_ROTL [[TRUNC]], [[AND]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ROTL]](s32)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: PseudoRET implicit $x10
%2:_(s64) = COPY $x10
%0:_(s32) = G_TRUNC %2(s64)
%3:_(s64) = COPY $x11
@@ -149,14 +151,14 @@ body: |
; RV64I-NEXT: $x10 = COPY [[OR]](s64)
; RV64I-NEXT: PseudoRET implicit $x10
;
- ; RV64ZBB-LABEL: name: rotl_i64
- ; RV64ZBB: liveins: $x10, $x11
- ; RV64ZBB-NEXT: {{ $}}
- ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
- ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
- ; RV64ZBB-NEXT: [[ROTL:%[0-9]+]]:_(s64) = G_ROTL [[COPY]], [[COPY1]](s64)
- ; RV64ZBB-NEXT: $x10 = COPY [[ROTL]](s64)
- ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ ; RV64ZBB_OR_RV64ZBKB-LABEL: name: rotl_i64
+ ; RV64ZBB_OR_RV64ZBKB: liveins: $x10, $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: {{ $}}
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ROTL:%[0-9]+]]:_(s64) = G_ROTL [[COPY]], [[COPY1]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: $x10 = COPY [[ROTL]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: PseudoRET implicit $x10
%0:_(s64) = COPY $x10
%1:_(s64) = COPY $x11
%2:_(s64) = G_ROTL %0, %1(s64)
@@ -265,18 +267,18 @@ body: |
; RV64I-NEXT: $x10 = COPY [[ANYEXT]](s64)
; RV64I-NEXT: PseudoRET implicit $x10
;
- ; RV64ZBB-LABEL: name: rotr_i32
- ; RV64ZBB: liveins: $x10, $x11
- ; RV64ZBB-NEXT: {{ $}}
- ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
- ; RV64ZBB-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
- ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
- ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
- ; RV64ZBB-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
- ; RV64ZBB-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[TRUNC]], [[AND]](s64)
- ; RV64ZBB-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ROTR]](s32)
- ; RV64ZBB-NEXT: $x10 = COPY [[ANYEXT]](s64)
- ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ ; RV64ZBB_OR_RV64ZBKB-LABEL: name: rotr_i32
+ ; RV64ZBB_OR_RV64ZBKB: liveins: $x10, $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: {{ $}}
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[TRUNC]], [[AND]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ROTR]](s32)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: PseudoRET implicit $x10
%2:_(s64) = COPY $x10
%0:_(s32) = G_TRUNC %2(s64)
%3:_(s64) = COPY $x11
@@ -309,14 +311,14 @@ body: |
; RV64I-NEXT: $x10 = COPY [[OR]](s64)
; RV64I-NEXT: PseudoRET implicit $x10
;
- ; RV64ZBB-LABEL: name: rotr_i64
- ; RV64ZBB: liveins: $x10, $x11
- ; RV64ZBB-NEXT: {{ $}}
- ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
- ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
- ; RV64ZBB-NEXT: [[ROTR:%[0-9]+]]:_(s64) = G_ROTR [[COPY]], [[COPY1]](s64)
- ; RV64ZBB-NEXT: $x10 = COPY [[ROTR]](s64)
- ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ ; RV64ZBB_OR_RV64ZBKB-LABEL: name: rotr_i64
+ ; RV64ZBB_OR_RV64ZBKB: liveins: $x10, $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: {{ $}}
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: [[ROTR:%[0-9]+]]:_(s64) = G_ROTR [[COPY]], [[COPY1]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: $x10 = COPY [[ROTR]](s64)
+ ; RV64ZBB_OR_RV64ZBKB-NEXT: PseudoRET implicit $x10
%0:_(s64) = COPY $x10
%1:_(s64) = COPY $x11
%2:_(s64) = G_ROTR %0, %1(s64)
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 7e14c0f2c43b..9a6e78c09ad8 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -85,6 +85,8 @@
; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV32ZVKT %s
; RUN: llc -mtriple=riscv32 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV32ZVFH %s
; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV32ZICOND %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV32ZIMOP %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV32ZCMOP %s
; RUN: llc -mtriple=riscv32 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SMAIA %s
; RUN: llc -mtriple=riscv32 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SSAIA %s
; RUN: llc -mtriple=riscv32 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s
@@ -177,6 +179,8 @@
; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV64ZVKT %s
; RUN: llc -mtriple=riscv64 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV64ZVFH %s
; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV64ZICOND %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV64ZIMOP %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV64ZCMOP %s
; RUN: llc -mtriple=riscv64 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SMAIA %s
; RUN: llc -mtriple=riscv64 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SSAIA %s
; RUN: llc -mtriple=riscv64 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s
@@ -271,6 +275,8 @@
; RV32ZVKT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvkt1p0_zvl32b1p0"
; RV32ZVFH: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfhmin1p0_zve32f1p0_zve32x1p0_zvfh1p0_zvfhmin1p0_zvl32b1p0"
; RV32ZICOND: .attribute 5, "rv32i2p1_zicond1p0"
+; RV32ZIMOP: .attribute 5, "rv32i2p1_zimop0p1"
+; RV32ZCMOP: .attribute 5, "rv32i2p1_zca1p0_zcmop0p2"
; RV32SMAIA: .attribute 5, "rv32i2p1_smaia1p0"
; RV32SSAIA: .attribute 5, "rv32i2p1_ssaia1p0"
; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin0p8"
@@ -362,6 +368,8 @@
; RV64ZVKT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvkt1p0_zvl32b1p0"
; RV64ZVFH: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfhmin1p0_zve32f1p0_zve32x1p0_zvfh1p0_zvfhmin1p0_zvl32b1p0"
; RV64ZICOND: .attribute 5, "rv64i2p1_zicond1p0"
+; RV64ZIMOP: .attribute 5, "rv64i2p1_zimop0p1"
+; RV64ZCMOP: .attribute 5, "rv64i2p1_zca1p0_zcmop0p2"
; RV64SMAIA: .attribute 5, "rv64i2p1_smaia1p0"
; RV64SSAIA: .attribute 5, "rv64i2p1_ssaia1p0"
; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin0p8"
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 8a0c4240d161..bfa2c3bb4a8b 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -39,8 +39,6 @@ define i16 @fcvt_si_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_si_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz
@@ -100,8 +98,6 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_si_bf16_sat:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: feq.s a0, fa5, fa5
@@ -145,8 +141,6 @@ define i16 @fcvt_ui_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_ui_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz
@@ -196,8 +190,6 @@ define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind {
; RV64ID-NEXT: lui a0, %hi(.LCPI3_0)
; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0)
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa4, a0
; RV64ID-NEXT: fmv.w.x fa3, zero
@@ -235,8 +227,6 @@ define i32 @fcvt_w_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_w_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz
@@ -281,8 +271,6 @@ define i32 @fcvt_w_bf16_sat(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_w_bf16_sat:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.w.s a0, fa5, rtz
@@ -321,8 +309,6 @@ define i32 @fcvt_wu_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_wu_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz
@@ -361,8 +347,6 @@ define i32 @fcvt_wu_bf16_multiple_use(bfloat %x, ptr %y) nounwind {
; RV64ID-LABEL: fcvt_wu_bf16_multiple_use:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz
@@ -413,8 +397,6 @@ define i32 @fcvt_wu_bf16_sat(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_wu_bf16_sat:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.wu.s a0, fa5, rtz
@@ -463,8 +445,6 @@ define i64 @fcvt_l_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_l_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz
@@ -606,8 +586,6 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_l_bf16_sat:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz
@@ -654,8 +632,6 @@ define i64 @fcvt_lu_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_lu_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz
@@ -730,8 +706,6 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_lu_bf16_sat:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz
@@ -1200,8 +1174,6 @@ define float @fcvt_s_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_s_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa0, a0
; RV64ID-NEXT: ret
@@ -1313,8 +1285,6 @@ define double @fcvt_d_bf16(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_d_bf16:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.d.s fa0, fa5
@@ -1521,8 +1491,6 @@ define signext i8 @fcvt_w_s_i8(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_w_s_i8:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz
@@ -1582,8 +1550,6 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_w_s_sat_i8:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: feq.s a0, fa5, fa5
@@ -1627,8 +1593,6 @@ define zeroext i8 @fcvt_wu_s_i8(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_wu_s_i8:
; RV64ID: # %bb.0:
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz
@@ -1676,8 +1640,6 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_wu_s_sat_i8:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fmv.w.x fa4, zero
@@ -1731,8 +1693,6 @@ define zeroext i32 @fcvt_wu_bf16_sat_zext(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_wu_bf16_sat_zext:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.wu.s a0, fa5, rtz
@@ -1784,8 +1744,6 @@ define signext i32 @fcvt_w_bf16_sat_sext(bfloat %a) nounwind {
; RV64ID-LABEL: fcvt_w_bf16_sat_sext:
; RV64ID: # %bb.0: # %start
; RV64ID-NEXT: fmv.x.w a0, fa0
-; RV64ID-NEXT: slli a0, a0, 48
-; RV64ID-NEXT: srli a0, a0, 48
; RV64ID-NEXT: slli a0, a0, 16
; RV64ID-NEXT: fmv.w.x fa5, a0
; RV64ID-NEXT: fcvt.w.s a0, fa5, rtz
diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll
index 5013f76f9b0b..d62f35388123 100644
--- a/llvm/test/CodeGen/RISCV/bfloat.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat.ll
@@ -164,8 +164,6 @@ define float @bfloat_to_float(bfloat %a) nounwind {
;
; RV64ID-LP64-LABEL: bfloat_to_float:
; RV64ID-LP64: # %bb.0:
-; RV64ID-LP64-NEXT: slli a0, a0, 48
-; RV64ID-LP64-NEXT: srli a0, a0, 48
; RV64ID-LP64-NEXT: slli a0, a0, 16
; RV64ID-LP64-NEXT: ret
;
@@ -179,8 +177,6 @@ define float @bfloat_to_float(bfloat %a) nounwind {
; RV64ID-LP64D-LABEL: bfloat_to_float:
; RV64ID-LP64D: # %bb.0:
; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0
-; RV64ID-LP64D-NEXT: slli a0, a0, 48
-; RV64ID-LP64D-NEXT: srli a0, a0, 48
; RV64ID-LP64D-NEXT: slli a0, a0, 16
; RV64ID-LP64D-NEXT: fmv.w.x fa0, a0
; RV64ID-LP64D-NEXT: ret
@@ -223,8 +219,6 @@ define double @bfloat_to_double(bfloat %a) nounwind {
;
; RV64ID-LP64-LABEL: bfloat_to_double:
; RV64ID-LP64: # %bb.0:
-; RV64ID-LP64-NEXT: slli a0, a0, 48
-; RV64ID-LP64-NEXT: srli a0, a0, 48
; RV64ID-LP64-NEXT: slli a0, a0, 16
; RV64ID-LP64-NEXT: fmv.w.x fa5, a0
; RV64ID-LP64-NEXT: fcvt.d.s fa5, fa5
@@ -242,8 +236,6 @@ define double @bfloat_to_double(bfloat %a) nounwind {
; RV64ID-LP64D-LABEL: bfloat_to_double:
; RV64ID-LP64D: # %bb.0:
; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0
-; RV64ID-LP64D-NEXT: slli a0, a0, 48
-; RV64ID-LP64D-NEXT: srli a0, a0, 48
; RV64ID-LP64D-NEXT: slli a0, a0, 16
; RV64ID-LP64D-NEXT: fmv.w.x fa5, a0
; RV64ID-LP64D-NEXT: fcvt.d.s fa0, fa5
@@ -366,10 +358,6 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
; RV64ID-LP64: # %bb.0:
; RV64ID-LP64-NEXT: addi sp, sp, -16
; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-LP64-NEXT: lui a2, 16
-; RV64ID-LP64-NEXT: addi a2, a2, -1
-; RV64ID-LP64-NEXT: and a0, a0, a2
-; RV64ID-LP64-NEXT: and a1, a1, a2
; RV64ID-LP64-NEXT: slli a1, a1, 16
; RV64ID-LP64-NEXT: fmv.w.x fa5, a1
; RV64ID-LP64-NEXT: slli a0, a0, 16
@@ -408,11 +396,7 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
; RV64ID-LP64D-NEXT: addi sp, sp, -16
; RV64ID-LP64D-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0
-; RV64ID-LP64D-NEXT: lui a1, 16
-; RV64ID-LP64D-NEXT: addi a1, a1, -1
-; RV64ID-LP64D-NEXT: and a0, a0, a1
-; RV64ID-LP64D-NEXT: fmv.x.w a2, fa1
-; RV64ID-LP64D-NEXT: and a1, a2, a1
+; RV64ID-LP64D-NEXT: fmv.x.w a1, fa1
; RV64ID-LP64D-NEXT: slli a1, a1, 16
; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1
; RV64ID-LP64D-NEXT: slli a0, a0, 16
@@ -604,12 +588,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64ID-LP64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64ID-LP64-NEXT: mv s0, a0
-; RV64ID-LP64-NEXT: lui a0, 16
-; RV64ID-LP64-NEXT: addi a0, a0, -1
-; RV64ID-LP64-NEXT: and a1, a1, a0
-; RV64ID-LP64-NEXT: and a0, a2, a0
-; RV64ID-LP64-NEXT: slli a0, a0, 16
-; RV64ID-LP64-NEXT: fmv.w.x fa5, a0
+; RV64ID-LP64-NEXT: slli a2, a2, 16
+; RV64ID-LP64-NEXT: fmv.w.x fa5, a2
; RV64ID-LP64-NEXT: slli a1, a1, 16
; RV64ID-LP64-NEXT: fmv.w.x fa4, a1
; RV64ID-LP64-NEXT: fadd.s fa5, fa4, fa5
@@ -651,11 +631,7 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
; RV64ID-LP64D-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64ID-LP64D-NEXT: mv s0, a0
; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0
-; RV64ID-LP64D-NEXT: lui a1, 16
-; RV64ID-LP64D-NEXT: addi a1, a1, -1
-; RV64ID-LP64D-NEXT: and a0, a0, a1
-; RV64ID-LP64D-NEXT: fmv.x.w a2, fa1
-; RV64ID-LP64D-NEXT: and a1, a2, a1
+; RV64ID-LP64D-NEXT: fmv.x.w a1, fa1
; RV64ID-LP64D-NEXT: slli a1, a1, 16
; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1
; RV64ID-LP64D-NEXT: slli a0, a0, 16
diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll
index f560a112dd92..a05c518bbf3a 100644
--- a/llvm/test/CodeGen/RISCV/bittest.ll
+++ b/llvm/test/CodeGen/RISCV/bittest.ll
@@ -3521,4 +3521,3 @@ define void @bit_64_1_nz_branch_i64(i64 %0) {
5:
ret void
}
-
diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
index 02aeebdeb377..e8cd1e35936a 100644
--- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll
+++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll
@@ -120,45 +120,36 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
; RV32-LABEL: test_lshr2:
; RV32: # %bb.0: # %entry
; RV32-NEXT: srli a2, a2, 2
-; RV32-NEXT: beqz a2, .LBB3_3
-; RV32-NEXT: # %bb.1: # %while.body.preheader
-; RV32-NEXT: slli a2, a2, 2
-; RV32-NEXT: add a2, a1, a2
-; RV32-NEXT: .LBB3_2: # %while.body
+; RV32-NEXT: beqz a2, .LBB3_2
+; RV32-NEXT: .LBB3_1: # %while.body
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-NEXT: lw a3, 0(a1)
-; RV32-NEXT: addi a4, a1, 4
+; RV32-NEXT: addi a1, a1, 4
; RV32-NEXT: slli a3, a3, 1
-; RV32-NEXT: addi a1, a0, 4
+; RV32-NEXT: addi a4, a0, 4
+; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: sw a3, 0(a0)
-; RV32-NEXT: mv a0, a1
-; RV32-NEXT: mv a1, a4
-; RV32-NEXT: bne a4, a2, .LBB3_2
-; RV32-NEXT: .LBB3_3: # %while.end
+; RV32-NEXT: mv a0, a4
+; RV32-NEXT: bnez a2, .LBB3_1
+; RV32-NEXT: .LBB3_2: # %while.end
; RV32-NEXT: li a0, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_lshr2:
; RV64: # %bb.0: # %entry
; RV64-NEXT: srliw a2, a2, 2
-; RV64-NEXT: beqz a2, .LBB3_3
-; RV64-NEXT: # %bb.1: # %while.body.preheader
-; RV64-NEXT: addi a2, a2, -1
-; RV64-NEXT: slli a2, a2, 32
-; RV64-NEXT: srli a2, a2, 30
-; RV64-NEXT: add a2, a2, a1
-; RV64-NEXT: addi a2, a2, 4
-; RV64-NEXT: .LBB3_2: # %while.body
+; RV64-NEXT: beqz a2, .LBB3_2
+; RV64-NEXT: .LBB3_1: # %while.body
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: lw a3, 0(a1)
-; RV64-NEXT: addi a4, a1, 4
+; RV64-NEXT: addi a1, a1, 4
; RV64-NEXT: slli a3, a3, 1
-; RV64-NEXT: addi a1, a0, 4
+; RV64-NEXT: addi a4, a0, 4
+; RV64-NEXT: addiw a2, a2, -1
; RV64-NEXT: sw a3, 0(a0)
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: mv a1, a4
-; RV64-NEXT: bne a4, a2, .LBB3_2
-; RV64-NEXT: .LBB3_3: # %while.end
+; RV64-NEXT: mv a0, a4
+; RV64-NEXT: bnez a2, .LBB3_1
+; RV64-NEXT: .LBB3_2: # %while.end
; RV64-NEXT: li a0, 0
; RV64-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/compress-inline-asm.ll b/llvm/test/CodeGen/RISCV/compress-inline-asm.ll
index 848b1ad70c0e..bb84ec6704a1 100644
--- a/llvm/test/CodeGen/RISCV/compress-inline-asm.ll
+++ b/llvm/test/CodeGen/RISCV/compress-inline-asm.ll
@@ -12,4 +12,3 @@ define i32 @compress_test(i32 %a) {
%2 = tail call i32 asm "add $0, $1, $2", "=r,r,r"(i32 %a, i32 %1)
ret i32 %2
}
-
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index 02072b3e4e5c..9bfd30daf7d4 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -978,4 +978,3 @@ entry:
declare i64 @llvm.cttz.i64(i64, i1 immarg)
declare i32 @llvm.cttz.i32(i32, i1 immarg)
declare i64 @llvm.ctlz.i64(i64, i1 immarg)
-
diff --git a/llvm/test/CodeGen/RISCV/div_minsize.ll b/llvm/test/CodeGen/RISCV/div_minsize.ll
index 6bc3bc7134e7..601821b932a5 100644
--- a/llvm/test/CodeGen/RISCV/div_minsize.ll
+++ b/llvm/test/CodeGen/RISCV/div_minsize.ll
@@ -68,4 +68,3 @@ define i32 @testsize4(i32 %x) minsize nounwind {
%div = udiv i32 %x, 33
ret i32 %div
}
-
diff --git a/llvm/test/CodeGen/RISCV/double-select-icmp.ll b/llvm/test/CodeGen/RISCV/double-select-icmp.ll
index cd28b589ff76..d864ff51b466 100644
--- a/llvm/test/CodeGen/RISCV/double-select-icmp.ll
+++ b/llvm/test/CodeGen/RISCV/double-select-icmp.ll
@@ -508,4 +508,3 @@ define double @select_icmp_sgt_zero(i32 signext %a) {
%2 = select i1 %1, double 0.000000e+00, double 1.000000e+00
ret double %2
}
-
diff --git a/llvm/test/CodeGen/RISCV/float-imm.ll b/llvm/test/CodeGen/RISCV/float-imm.ll
index a25955e2ef34..c38416d994ba 100644
--- a/llvm/test/CodeGen/RISCV/float-imm.ll
+++ b/llvm/test/CodeGen/RISCV/float-imm.ll
@@ -73,4 +73,3 @@ define float @float_negative_zero(ptr %pf) nounwind {
; CHECKZFINX-NEXT: ret
ret float -0.0
}
-
diff --git a/llvm/test/CodeGen/RISCV/float-select-verify.ll b/llvm/test/CodeGen/RISCV/float-select-verify.ll
index 6f414f16163b..b38560fa4136 100644
--- a/llvm/test/CodeGen/RISCV/float-select-verify.ll
+++ b/llvm/test/CodeGen/RISCV/float-select-verify.ll
@@ -90,4 +90,3 @@ declare void @foo(i64)
declare void @bar(float)
declare float @llvm.round.f32(float)
-
diff --git a/llvm/test/CodeGen/RISCV/fmax-fmin.ll b/llvm/test/CodeGen/RISCV/fmax-fmin.ll
index aac5e5efc2cf..b67093d72439 100644
--- a/llvm/test/CodeGen/RISCV/fmax-fmin.ll
+++ b/llvm/test/CodeGen/RISCV/fmax-fmin.ll
@@ -304,4 +304,3 @@ declare float @llvm.maxnum.f32(float, float)
declare double @llvm.maxnum.f64(double, double)
declare float @llvm.minnum.f32(float, float)
declare double @llvm.minnum.f64(double, double)
-
diff --git a/llvm/test/CodeGen/RISCV/half-select-icmp.ll b/llvm/test/CodeGen/RISCV/half-select-icmp.ll
index a2da15e1fe16..0e4030049e91 100644
--- a/llvm/test/CodeGen/RISCV/half-select-icmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-select-icmp.ll
@@ -537,4 +537,3 @@ define half @select_icmp_sgt_zero(i32 signext %a) {
%2 = select i1 %1, half 0.000000e+00, half 1.000000e+00
ret half %2
}
-
diff --git a/llvm/test/CodeGen/RISCV/init-array.ll b/llvm/test/CodeGen/RISCV/init-array.ll
index 7935f320e803..b514e6fe8f4e 100644
--- a/llvm/test/CodeGen/RISCV/init-array.ll
+++ b/llvm/test/CodeGen/RISCV/init-array.ll
@@ -27,4 +27,3 @@ define internal void @_GLOBAL__I_a() section ".text.startup" {
;CTOR: .section .ctors
;CTOR-NOT: section .init_array
-
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-abi-names.ll b/llvm/test/CodeGen/RISCV/inline-asm-abi-names.ll
index 8bfce389497b..e235372b0587 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-abi-names.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-abi-names.ll
@@ -156,26 +156,18 @@ define i32 @explicit_register_sp(i32 %a) nounwind {
define i32 @explicit_register_x3(i32 %a) nounwind {
; RV32I-LABEL: explicit_register_x3:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw gp, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv gp, a0
; RV32I-NEXT: #APP
; RV32I-NEXT: addi a0, gp, 0
; RV32I-NEXT: #NO_APP
-; RV32I-NEXT: lw gp, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: explicit_register_x3:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd gp, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv gp, a0
; RV64I-NEXT: #APP
; RV64I-NEXT: addi a0, gp, 0
; RV64I-NEXT: #NO_APP
-; RV64I-NEXT: ld gp, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
%1 = tail call i32 asm "addi $0, $1, 0", "=r,{x3}"(i32 %a)
ret i32 %1
@@ -185,26 +177,18 @@ define i32 @explicit_register_x3(i32 %a) nounwind {
define i32 @explicit_register_gp(i32 %a) nounwind {
; RV32I-LABEL: explicit_register_gp:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw gp, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv gp, a0
; RV32I-NEXT: #APP
; RV32I-NEXT: addi a0, gp, 0
; RV32I-NEXT: #NO_APP
-; RV32I-NEXT: lw gp, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: explicit_register_gp:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd gp, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv gp, a0
; RV64I-NEXT: #APP
; RV64I-NEXT: addi a0, gp, 0
; RV64I-NEXT: #NO_APP
-; RV64I-NEXT: ld gp, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
%1 = tail call i32 asm "addi $0, $1, 0", "=r,{gp}"(i32 %a)
ret i32 %1
@@ -214,26 +198,18 @@ define i32 @explicit_register_gp(i32 %a) nounwind {
define i32 @explicit_register_x4(i32 %a) nounwind {
; RV32I-LABEL: explicit_register_x4:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw tp, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv tp, a0
; RV32I-NEXT: #APP
; RV32I-NEXT: addi a0, tp, 0
; RV32I-NEXT: #NO_APP
-; RV32I-NEXT: lw tp, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: explicit_register_x4:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd tp, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv tp, a0
; RV64I-NEXT: #APP
; RV64I-NEXT: addi a0, tp, 0
; RV64I-NEXT: #NO_APP
-; RV64I-NEXT: ld tp, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
%1 = tail call i32 asm "addi $0, $1, 0", "=r,{x4}"(i32 %a)
ret i32 %1
@@ -243,26 +219,18 @@ define i32 @explicit_register_x4(i32 %a) nounwind {
define i32 @explicit_register_tp(i32 %a) nounwind {
; RV32I-LABEL: explicit_register_tp:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw tp, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv tp, a0
; RV32I-NEXT: #APP
; RV32I-NEXT: addi a0, tp, 0
; RV32I-NEXT: #NO_APP
-; RV32I-NEXT: lw tp, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
; RV64I-LABEL: explicit_register_tp:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -16
-; RV64I-NEXT: sd tp, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv tp, a0
; RV64I-NEXT: #APP
; RV64I-NEXT: addi a0, tp, 0
; RV64I-NEXT: #NO_APP
-; RV64I-NEXT: ld tp, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
%1 = tail call i32 asm "addi $0, $1, 0", "=r,{tp}"(i32 %a)
ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/macro-fusions-veyron-v1.mir b/llvm/test/CodeGen/RISCV/macro-fusions.mir
index 6d1e92e997b3..13464141ce27 100644
--- a/llvm/test/CodeGen/RISCV/macro-fusions-veyron-v1.mir
+++ b/llvm/test/CodeGen/RISCV/macro-fusions.mir
@@ -1,7 +1,7 @@
# REQUIRES: asserts
-# RUN: llc -mtriple=riscv64-linux-gnu -mcpu=veyron-v1 -x=mir < %s \
+# RUN: llc -mtriple=riscv64-linux-gnu -x=mir < %s \
# RUN: -debug-only=machine-scheduler -start-before=machine-scheduler 2>&1 \
-# RUN: -mattr=+lui-addi-fusion,+auipc-addi-fusion,+shifted-zext-fusion,+ld-add-fusion \
+# RUN: -mattr=+lui-addi-fusion,+auipc-addi-fusion,+zexth-fusion,+zextw-fusion,+shifted-zextw-fusion,+ld-add-fusion \
# RUN: | FileCheck %s
# CHECK: lui_addi:%bb.0
@@ -38,10 +38,10 @@ body: |
PseudoRET
...
-# CHECK: slli_srli
+# CHECK: slli_srli_shifted_zext
# CHECK: Macro fuse: {{.*}}SLLI - SRLI
---
-name: slli_srli
+name: slli_srli_shifted_zext
tracksRegLiveness: true
body: |
bb.0.entry:
@@ -55,10 +55,10 @@ body: |
PseudoRET
...
-# CHECK: slli_srli_48
+# CHECK: slli_srli_zexth
# CHECK: Macro fuse: {{.*}}SLLI - SRLI
---
-name: slli_srli_48
+name: slli_srli_zexth
tracksRegLiveness: true
body: |
bb.0.entry:
@@ -72,6 +72,23 @@ body: |
PseudoRET
...
+# CHECK: slli_srli_zextw
+# CHECK: Macro fuse: {{.*}}SLLI - SRLI
+---
+name: slli_srli_zextw
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x10
+ %1:gpr = COPY $x10
+ %2:gpr = SLLI %1, 32
+ %3:gpr = XORI %1, 3
+ %4:gpr = SRLI %2, 32
+ $x10 = COPY %3
+ $x11 = COPY %4
+ PseudoRET
+...
+
# CHECK: slli_srli_no_fusion_0
# CHECK-NOT: Macro fuse: {{.*}}SLLI - SRLI
---
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index 06be6cbd9641..6f301882b452 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -256,4 +256,3 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
%neg = sub nsw i64 0, %abs
ret i64 %neg
}
-
diff --git a/llvm/test/CodeGen/RISCV/pr63816.ll b/llvm/test/CodeGen/RISCV/pr63816.ll
index 21730dfcf13b..6eaec08abb90 100644
--- a/llvm/test/CodeGen/RISCV/pr63816.ll
+++ b/llvm/test/CodeGen/RISCV/pr63816.ll
@@ -80,4 +80,3 @@ define void @test(ptr %0, ptr %1) nounwind {
store <8 x double> %V2, ptr %1
ret void
}
-
diff --git a/llvm/test/CodeGen/RISCV/reduction-formation.ll b/llvm/test/CodeGen/RISCV/reduction-formation.ll
index b2dea4237f5a..6b4dc0cd3699 100644
--- a/llvm/test/CodeGen/RISCV/reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/reduction-formation.ll
@@ -100,4 +100,3 @@ define i32 @reduce_or_4xi32(<4 x i32> %v) {
%or2 = or i32 %or1, %e3
ret i32 %or2
}
-
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index 31ca8eab3350..c7454469fd69 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -8,18 +8,16 @@
define void @test1(ptr nocapture noundef %a, i32 noundef signext %n) {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: blez a1, .LBB0_3
-; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: .LBB0_2: # %for.body
+; CHECK-NEXT: blez a1, .LBB0_2
+; CHECK-NEXT: .LBB0_1: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lw a2, 0(a0)
; CHECK-NEXT: addi a2, a2, 4
; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: bne a0, a1, .LBB0_2
-; CHECK-NEXT: .LBB0_3: # %for.cond.cleanup
+; CHECK-NEXT: bnez a1, .LBB0_1
+; CHECK-NEXT: .LBB0_2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
%cmp3 = icmp sgt i32 %n, 0
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
index 6d302fcfc78d..3bf7704dd183 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
@@ -320,4 +320,3 @@ define i32 @mul288(i32 %a) {
%c = mul i32 %a, 288
ret i32 %c
}
-
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll
index 1c8727668f68..34301ba5aadc 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbs.ll
@@ -73,4 +73,3 @@ define i64 @th_tst_i64_cmp(i64 %a) nounwind {
%zext = zext i1 %cmp to i64
ret i64 %zext
}
-
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
index 20eb4cc10f40..62ea25218e2d 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
@@ -1309,4 +1309,3 @@ declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
-
diff --git a/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll b/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll
index 3fbd13f1f259..d2a3bccfef7b 100644
--- a/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-patchpoint.ll
@@ -21,4 +21,3 @@ entry:
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
-
diff --git a/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll b/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll
index 70a4c1726ca3..25278caee64d 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-double-softfloat.ll
@@ -55,4 +55,3 @@ entry:
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %a, metadata !"fpexcept.strict")
ret i32 %conv
}
-
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
index ed2b159d0ae7..6f56babf28f5 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll
@@ -316,4 +316,3 @@ define i64 @mul288(i64 %a) {
%c = mul i64 %a, 288
ret i64 %c
}
-
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll
index f36c618af199..4e20594f2b29 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbs.ll
@@ -69,4 +69,3 @@ define i64 @th_tst_i64_cmp(i64 %a) nounwind {
%zext = zext i1 %cmp to i64
ret i64 %zext
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll
index f9e6f5fe06df..6875925adad8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll
@@ -619,4 +619,3 @@ define <vscale x 1 x double> @nxv2f64(double %x, double %y) {
%v = fadd <vscale x 1 x double> %splat.x, %splat.y
ret <vscale x 1 x double> %v
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll b/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll
index 1e83b0b0dea8..bd65ed52be68 100644
--- a/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll
@@ -219,4 +219,3 @@ define void @v4xi64_concat_vector_insert_idx3(ptr %a, ptr %b, i64 %x) {
store <4 x i64> %ins, ptr %a
ret void
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index 47d65c2593a4..fc94f8c2a527 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1231,16 +1231,17 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv1i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vmv.v.x v9, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8
-; CHECK-F-NEXT: vsrl.vi v8, v9, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v9, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v9, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT: vsrl.vi v8, v10, 23
+; CHECK-F-NEXT: vwsubu.wv v9, v9, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v9, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -1371,16 +1372,17 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv2i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vmv.v.x v10, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT: vsrl.vi v8, v10, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v10, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v10, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
+; CHECK-F-NEXT: vsrl.vi v8, v12, 23
+; CHECK-F-NEXT: vwsubu.wv v10, v10, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v10, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -1511,16 +1513,17 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv4i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vmv.v.x v12, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
-; CHECK-F-NEXT: vsrl.vi v8, v12, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v12, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v12, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
+; CHECK-F-NEXT: vsrl.vi v8, v16, 23
+; CHECK-F-NEXT: vwsubu.wv v12, v12, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v12, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -1651,16 +1654,17 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_nxv8i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vmv.v.x v16, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
-; CHECK-F-NEXT: vsrl.vi v8, v16, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v16, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v16, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v24, v8
+; CHECK-F-NEXT: vsrl.vi v8, v24, 23
+; CHECK-F-NEXT: vwsubu.wv v16, v16, v8
; CHECK-F-NEXT: li a1, 64
-; CHECK-F-NEXT: vminu.vx v8, v8, a1
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vminu.vx v8, v16, a1
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
@@ -2833,15 +2837,16 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vmv.v.x v9, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8
-; CHECK-F-NEXT: vsrl.vi v8, v9, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v9, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v9, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT: vsrl.vi v8, v10, 23
+; CHECK-F-NEXT: vwsubu.wv v9, v9, v8
; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: vmv1r.v v8, v9
; CHECK-F-NEXT: ret
;
; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64:
@@ -2968,15 +2973,16 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vmv.v.x v10, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT: vsrl.vi v8, v10, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v10, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v10, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
+; CHECK-F-NEXT: vsrl.vi v8, v12, 23
+; CHECK-F-NEXT: vwsubu.wv v10, v10, v8
; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: vmv2r.v v8, v10
; CHECK-F-NEXT: ret
;
; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64:
@@ -3103,15 +3109,16 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vmv.v.x v12, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8
-; CHECK-F-NEXT: vsrl.vi v8, v12, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v12, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v12, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
+; CHECK-F-NEXT: vsrl.vi v8, v16, 23
+; CHECK-F-NEXT: vwsubu.wv v12, v12, v8
; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: vmv4r.v v8, v12
; CHECK-F-NEXT: ret
;
; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64:
@@ -3238,14 +3245,15 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
;
; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64:
; CHECK-F: # %bb.0:
-; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vmv8r.v v16, v8
+; CHECK-F-NEXT: li a0, 190
+; CHECK-F-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vmv.v.x v8, a0
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-F-NEXT: fsrmi a0, 1
-; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8
-; CHECK-F-NEXT: vsrl.vi v8, v16, 23
-; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT: vzext.vf2 v16, v8
-; CHECK-F-NEXT: li a1, 190
-; CHECK-F-NEXT: vrsub.vx v8, v16, a1
+; CHECK-F-NEXT: vfncvt.f.xu.w v24, v16
+; CHECK-F-NEXT: vsrl.vi v16, v24, 23
+; CHECK-F-NEXT: vwsubu.wv v8, v8, v16
; CHECK-F-NEXT: fsrm a0
; CHECK-F-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
index 2ff0b21cd251..ee9ad097b442 100644
--- a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll
@@ -1209,33 +1209,15 @@ define <vscale x 1 x i16> @rint_nxv1f64_to_ui16(<vscale x 1 x double> %x) {
define <vscale x 1 x i32> @rint_nxv1f64_to_si32(<vscale x 1 x double> %x) {
; RV32-LABEL: rint_nxv1f64_to_si32:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI36_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI36_0)(a0)
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vfabs.v v9, v8
-; RV32-NEXT: vmflt.vf v0, v9, fa5
-; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; RV32-NEXT: vfncvt.rtz.x.f.w v9, v8
+; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; RV32-NEXT: vfncvt.x.f.w v9, v8
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv1f64_to_si32:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI36_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI36_0)(a0)
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vfabs.v v9, v8
-; RV64-NEXT: vmflt.vf v0, v9, fa5
-; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; RV64-NEXT: vfncvt.rtz.x.f.w v9, v8
+; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; RV64-NEXT: vfncvt.x.f.w v9, v8
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x)
@@ -1246,33 +1228,15 @@ define <vscale x 1 x i32> @rint_nxv1f64_to_si32(<vscale x 1 x double> %x) {
define <vscale x 1 x i32> @rint_nxv1f64_to_ui32(<vscale x 1 x double> %x) {
; RV32-LABEL: rint_nxv1f64_to_ui32:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI37_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI37_0)(a0)
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vfabs.v v9, v8
-; RV32-NEXT: vmflt.vf v0, v9, fa5
-; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; RV32-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; RV32-NEXT: vfncvt.xu.f.w v9, v8
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv1f64_to_ui32:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI37_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI37_0)(a0)
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vfabs.v v9, v8
-; RV64-NEXT: vmflt.vf v0, v9, fa5
-; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; RV64-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; RV64-NEXT: vfncvt.xu.f.w v9, v8
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
%a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x)
@@ -1283,30 +1247,14 @@ define <vscale x 1 x i32> @rint_nxv1f64_to_ui32(<vscale x 1 x double> %x) {
define <vscale x 1 x i64> @rint_nxv1f64_to_si64(<vscale x 1 x double> %x) {
; RV32-LABEL: rint_nxv1f64_to_si64:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI38_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI38_0)(a0)
; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vfabs.v v9, v8
-; RV32-NEXT: vmflt.vf v0, v9, fa5
-; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV32-NEXT: vfcvt.rtz.x.f.v v8, v8
+; RV32-NEXT: vfcvt.x.f.v v8, v8
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv1f64_to_si64:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI38_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI38_0)(a0)
; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vfabs.v v9, v8
-; RV64-NEXT: vmflt.vf v0, v9, fa5
-; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV64-NEXT: vfcvt.rtz.x.f.v v8, v8
+; RV64-NEXT: vfcvt.x.f.v v8, v8
; RV64-NEXT: ret
%a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x)
%b = fptosi <vscale x 1 x double> %a to <vscale x 1 x i64>
@@ -1316,30 +1264,14 @@ define <vscale x 1 x i64> @rint_nxv1f64_to_si64(<vscale x 1 x double> %x) {
define <vscale x 1 x i64> @rint_nxv1f64_to_ui64(<vscale x 1 x double> %x) {
; RV32-LABEL: rint_nxv1f64_to_ui64:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI39_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI39_0)(a0)
; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vfabs.v v9, v8
-; RV32-NEXT: vmflt.vf v0, v9, fa5
-; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV32-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; RV32-NEXT: vfcvt.xu.f.v v8, v8
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv1f64_to_ui64:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI39_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI39_0)(a0)
; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vfabs.v v9, v8
-; RV64-NEXT: vmflt.vf v0, v9, fa5
-; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; RV64-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; RV64-NEXT: vfcvt.xu.f.v v8, v8
; RV64-NEXT: ret
%a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x)
%b = fptoui <vscale x 1 x double> %a to <vscale x 1 x i64>
@@ -1519,33 +1451,15 @@ define <vscale x 4 x i16> @rint_nxv4f64_to_ui16(<vscale x 4 x double> %x) {
define <vscale x 4 x i32> @rint_nxv4f64_to_si32(<vscale x 4 x double> %x) {
; RV32-LABEL: rint_nxv4f64_to_si32:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI44_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI44_0)(a0)
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vfabs.v v12, v8
-; RV32-NEXT: vmflt.vf v0, v12, fa5
-; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vfncvt.rtz.x.f.w v12, v8
+; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT: vfncvt.x.f.w v12, v8
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f64_to_si32:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI44_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI44_0)(a0)
-; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT: vfabs.v v12, v8
-; RV64-NEXT: vmflt.vf v0, v12, fa5
-; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT: vfncvt.rtz.x.f.w v12, v8
+; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT: vfncvt.x.f.w v12, v8
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
%a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x)
@@ -1556,33 +1470,15 @@ define <vscale x 4 x i32> @rint_nxv4f64_to_si32(<vscale x 4 x double> %x) {
define <vscale x 4 x i32> @rint_nxv4f64_to_ui32(<vscale x 4 x double> %x) {
; RV32-LABEL: rint_nxv4f64_to_ui32:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI45_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI45_0)(a0)
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vfabs.v v12, v8
-; RV32-NEXT: vmflt.vf v0, v12, fa5
-; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vfncvt.rtz.xu.f.w v12, v8
+; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT: vfncvt.xu.f.w v12, v8
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f64_to_ui32:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI45_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI45_0)(a0)
-; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT: vfabs.v v12, v8
-; RV64-NEXT: vmflt.vf v0, v12, fa5
-; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT: vfncvt.rtz.xu.f.w v12, v8
+; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT: vfncvt.xu.f.w v12, v8
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
%a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x)
@@ -1593,30 +1489,14 @@ define <vscale x 4 x i32> @rint_nxv4f64_to_ui32(<vscale x 4 x double> %x) {
define <vscale x 4 x i64> @rint_nxv4f64_to_si64(<vscale x 4 x double> %x) {
; RV32-LABEL: rint_nxv4f64_to_si64:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI46_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI46_0)(a0)
; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vfabs.v v12, v8
-; RV32-NEXT: vmflt.vf v0, v12, fa5
-; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV32-NEXT: vfcvt.rtz.x.f.v v8, v8
+; RV32-NEXT: vfcvt.x.f.v v8, v8
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f64_to_si64:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI46_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI46_0)(a0)
; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT: vfabs.v v12, v8
-; RV64-NEXT: vmflt.vf v0, v12, fa5
-; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV64-NEXT: vfcvt.rtz.x.f.v v8, v8
+; RV64-NEXT: vfcvt.x.f.v v8, v8
; RV64-NEXT: ret
%a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x)
%b = fptosi <vscale x 4 x double> %a to <vscale x 4 x i64>
@@ -1626,30 +1506,14 @@ define <vscale x 4 x i64> @rint_nxv4f64_to_si64(<vscale x 4 x double> %x) {
define <vscale x 4 x i64> @rint_nxv4f64_to_ui64(<vscale x 4 x double> %x) {
; RV32-LABEL: rint_nxv4f64_to_ui64:
; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI47_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI47_0)(a0)
; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vfabs.v v12, v8
-; RV32-NEXT: vmflt.vf v0, v12, fa5
-; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV32-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; RV32-NEXT: vfcvt.xu.f.v v8, v8
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f64_to_ui64:
; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI47_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI47_0)(a0)
; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT: vfabs.v v12, v8
-; RV64-NEXT: vmflt.vf v0, v12, fa5
-; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t
-; RV64-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; RV64-NEXT: vfcvt.xu.f.v v8, v8
; RV64-NEXT: ret
%a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x)
%b = fptoui <vscale x 4 x double> %a to <vscale x 4 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
index 5d88bc02f1f4..ee8c322961c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
@@ -636,4 +636,3 @@ define <1 x double> @v2f64(double %x, double %y) {
%v = fadd <1 x double> %splat.x, %splat.y
ret <1 x double> %v
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 4852850f234b..b8b41b9e4c91 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -206,19 +206,33 @@ define <8 x float> @splat_idx_v8f32(<8 x float> %v, i64 %idx) {
; Test that we pull the vlse of the constant pool out of the loop.
define dso_local void @splat_load_licm(float* %0) {
-; CHECK-LABEL: splat_load_licm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 263168
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a2
-; CHECK-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB12_1
-; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: ret
+; RV32-LABEL: splat_load_licm:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 1024
+; RV32-NEXT: lui a2, 263168
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: addi a1, a1, -4
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: bnez a1, .LBB12_1
+; RV32-NEXT: # %bb.2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: splat_load_licm:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 1024
+; RV64-NEXT: lui a2, 263168
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a2
+; RV64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: addiw a1, a1, -4
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: bnez a1, .LBB12_1
+; RV64-NEXT: # %bb.2:
+; RV64-NEXT: ret
br label %2
2: ; preds = %2, %1
@@ -1394,6 +1408,3 @@ define <2 x double> @vid_step2_v2f64() {
; CHECK-NEXT: ret
ret <2 x double> <double 0.0, double 2.0>
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; RV32: {{.*}}
-; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index d9958f4aae35..5407eadb160b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -637,7 +637,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfabs.v v16, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t
; CHECK-NEXT: frflags a0
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index fd4a54b468f1..2a0ec47a3de0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -982,5 +982,3 @@ define float @reduce_fadd_4xi32_non_associative2(ptr %p) {
%fadd2 = fadd fast float %fadd1, %e3
ret float %fadd2
}
-
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index eeb188627577..846295b3ead2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -13,7 +13,7 @@
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi a2, a0, 1024
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
@@ -23,9 +23,10 @@ define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B)
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
-; CHECK-NEXT: bne a0, a2, .LBB0_1
+; CHECK-NEXT: bnez a2, .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -53,7 +54,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; V-LABEL: gather_masked:
; V: # %bb.0: # %entry
-; V-NEXT: addi a2, a0, 1024
+; V-NEXT: li a2, 1024
; V-NEXT: lui a3, 983765
; V-NEXT: addi a3, a3, 873
; V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
@@ -68,15 +69,16 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado
; V-NEXT: vle8.v v10, (a0)
; V-NEXT: vadd.vv v9, v10, v9
; V-NEXT: vse8.v v9, (a0)
+; V-NEXT: addi a2, a2, -32
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
-; V-NEXT: bne a0, a2, .LBB1_1
+; V-NEXT: bnez a2, .LBB1_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_masked:
; ZVE32F: # %bb.0: # %entry
-; ZVE32F-NEXT: addi a2, a0, 1024
+; ZVE32F-NEXT: li a2, 1024
; ZVE32F-NEXT: lui a3, 983765
; ZVE32F-NEXT: addi a3, a3, 873
; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma
@@ -91,9 +93,10 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado
; ZVE32F-NEXT: vle8.v v10, (a0)
; ZVE32F-NEXT: vadd.vv v9, v10, v9
; ZVE32F-NEXT: vse8.v v9, (a0)
+; ZVE32F-NEXT: addi a2, a2, -32
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: addi a1, a1, 160
-; ZVE32F-NEXT: bne a0, a2, .LBB1_1
+; ZVE32F-NEXT: bnez a2, .LBB1_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
entry:
@@ -122,7 +125,7 @@ define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapt
; CHECK-LABEL: gather_negative_stride:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a1, a1, 155
-; CHECK-NEXT: addi a2, a0, 1024
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: li a3, -5
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
@@ -132,9 +135,10 @@ define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapt
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
-; CHECK-NEXT: bne a0, a2, .LBB2_1
+; CHECK-NEXT: bnez a2, .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -162,7 +166,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi a2, a0, 1024
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT: .LBB3_1: # %vector.body
@@ -171,9 +175,10 @@ define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a3
; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 160
-; CHECK-NEXT: bne a0, a2, .LBB3_1
+; CHECK-NEXT: bnez a2, .LBB3_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -201,7 +206,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V: # %bb.0: # %entry
-; V-NEXT: addi a2, a0, 1024
+; V-NEXT: li a2, 1024
; V-NEXT: li a3, 32
; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT: .LBB4_1: # %vector.body
@@ -210,15 +215,16 @@ define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias noc
; V-NEXT: vle8.v v9, (a0)
; V-NEXT: vdivu.vv v8, v8, v9
; V-NEXT: vse8.v v8, (a0)
+; V-NEXT: addi a2, a2, -32
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
-; V-NEXT: bne a0, a2, .LBB4_1
+; V-NEXT: bnez a2, .LBB4_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F: # %bb.0: # %entry
-; ZVE32F-NEXT: addi a2, a0, 1024
+; ZVE32F-NEXT: li a2, 1024
; ZVE32F-NEXT: li a3, 32
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT: .LBB4_1: # %vector.body
@@ -227,15 +233,16 @@ define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias noc
; ZVE32F-NEXT: vle8.v v9, (a0)
; ZVE32F-NEXT: vdivu.vv v8, v8, v9
; ZVE32F-NEXT: vse8.v v8, (a0)
+; ZVE32F-NEXT: addi a2, a2, -32
; ZVE32F-NEXT: addi a0, a0, 32
; ZVE32F-NEXT: addi a1, a1, 160
-; ZVE32F-NEXT: bne a0, a2, .LBB4_1
+; ZVE32F-NEXT: bnez a2, .LBB4_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
;
; NOT-OPTIMIZED-LABEL: gather_zero_stride_unfold:
; NOT-OPTIMIZED: # %bb.0: # %entry
-; NOT-OPTIMIZED-NEXT: addi a2, a0, 1024
+; NOT-OPTIMIZED-NEXT: li a2, 1024
; NOT-OPTIMIZED-NEXT: li a3, 32
; NOT-OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; NOT-OPTIMIZED-NEXT: .LBB4_1: # %vector.body
@@ -245,9 +252,10 @@ define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias noc
; NOT-OPTIMIZED-NEXT: vmv.v.x v9, a3
; NOT-OPTIMIZED-NEXT: vdivu.vv v8, v9, v8
; NOT-OPTIMIZED-NEXT: vse8.v v8, (a0)
+; NOT-OPTIMIZED-NEXT: addi a2, a2, -32
; NOT-OPTIMIZED-NEXT: addi a0, a0, 32
; NOT-OPTIMIZED-NEXT: addi a1, a1, 160
-; NOT-OPTIMIZED-NEXT: bne a0, a2, .LBB4_1
+; NOT-OPTIMIZED-NEXT: bnez a2, .LBB4_1
; NOT-OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup
; NOT-OPTIMIZED-NEXT: ret
entry:
@@ -279,7 +287,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi a2, a1, 1024
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
@@ -289,9 +297,10 @@ define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B
; CHECK-NEXT: vlse8.v v9, (a0), a3
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse8.v v8, (a0), a3
+; CHECK-NEXT: addi a2, a2, -32
; CHECK-NEXT: addi a1, a1, 32
; CHECK-NEXT: addi a0, a0, 160
-; CHECK-NEXT: bne a1, a2, .LBB5_1
+; CHECK-NEXT: bnez a2, .LBB5_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -319,7 +328,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; V-LABEL: scatter_masked:
; V: # %bb.0: # %entry
-; V-NEXT: addi a2, a1, 1024
+; V-NEXT: li a2, 1024
; V-NEXT: li a3, 32
; V-NEXT: lui a4, 983765
; V-NEXT: addi a4, a4, 873
@@ -334,15 +343,16 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read
; V-NEXT: vlse8.v v10, (a0), a4, v0.t
; V-NEXT: vadd.vv v9, v10, v9
; V-NEXT: vsse8.v v9, (a0), a4, v0.t
+; V-NEXT: addi a2, a2, -32
; V-NEXT: addi a1, a1, 32
; V-NEXT: addi a0, a0, 160
-; V-NEXT: bne a1, a2, .LBB6_1
+; V-NEXT: bnez a2, .LBB6_1
; V-NEXT: # %bb.2: # %for.cond.cleanup
; V-NEXT: ret
;
; ZVE32F-LABEL: scatter_masked:
; ZVE32F: # %bb.0: # %entry
-; ZVE32F-NEXT: addi a2, a1, 1024
+; ZVE32F-NEXT: li a2, 1024
; ZVE32F-NEXT: li a3, 32
; ZVE32F-NEXT: lui a4, 983765
; ZVE32F-NEXT: addi a4, a4, 873
@@ -357,9 +367,10 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read
; ZVE32F-NEXT: vlse8.v v10, (a0), a4, v0.t
; ZVE32F-NEXT: vadd.vv v9, v10, v9
; ZVE32F-NEXT: vsse8.v v9, (a0), a4, v0.t
+; ZVE32F-NEXT: addi a2, a2, -32
; ZVE32F-NEXT: addi a1, a1, 32
; ZVE32F-NEXT: addi a0, a0, 160
-; ZVE32F-NEXT: bne a1, a2, .LBB6_1
+; ZVE32F-NEXT: bnez a2, .LBB6_1
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT: ret
entry:
@@ -391,8 +402,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: li a4, 32
; CHECK-NEXT: .LBB7_1: # %vector.body
@@ -405,9 +415,10 @@ define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -8
; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: addi a1, a1, 128
-; CHECK-NEXT: bne a0, a2, .LBB7_1
+; CHECK-NEXT: bnez a2, .LBB7_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -439,8 +450,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a1, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: .LBB8_1: # %vector.body
@@ -451,9 +461,10 @@ define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readon
; CHECK-NEXT: vlse32.v v9, (a0), a4
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vsse32.v v8, (a0), a4
+; CHECK-NEXT: addi a2, a2, -8
; CHECK-NEXT: addi a1, a1, 32
; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: bne a1, a2, .LBB8_1
+; CHECK-NEXT: bnez a2, .LBB8_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -490,8 +501,7 @@ define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture reado
; CHECK-LABEL: struct_gather:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a1, a1, 132
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT: .LBB9_1: # %vector.body
@@ -506,9 +516,10 @@ define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture reado
; CHECK-NEXT: vadd.vv v9, v11, v9
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: vse32.v v9, (a4)
+; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: addi a0, a0, 64
; CHECK-NEXT: addi a1, a1, 256
-; CHECK-NEXT: bne a0, a2, .LBB9_1
+; CHECK-NEXT: bnez a2, .LBB9_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -639,8 +650,7 @@ declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V: # %bb.0: # %bb
-; V-NEXT: lui a2, 2
-; V-NEXT: add a2, a0, a2
+; V-NEXT: li a2, 1024
; V-NEXT: li a3, 40
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: .LBB11_1: # %bb2
@@ -651,22 +661,22 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
; V-NEXT: addi a4, a0, 16
; V-NEXT: vse64.v v8, (a0)
; V-NEXT: vse64.v v9, (a4)
+; V-NEXT: addi a2, a2, -4
; V-NEXT: addi a0, a0, 32
; V-NEXT: addi a1, a1, 160
-; V-NEXT: bne a0, a2, .LBB11_1
+; V-NEXT: bnez a2, .LBB11_1
; V-NEXT: # %bb.2: # %bb18
; V-NEXT: ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F: # %bb.0: # %bb
; ZVE32F-NEXT: li a2, 0
-; ZVE32F-NEXT: lui a3, 2
-; ZVE32F-NEXT: add a3, a0, a3
-; ZVE32F-NEXT: li a4, 1
+; ZVE32F-NEXT: li a3, 1
+; ZVE32F-NEXT: li a4, 1024
; ZVE32F-NEXT: li a5, 40
; ZVE32F-NEXT: .LBB11_1: # %bb2
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT: mul a6, a4, a5
+; ZVE32F-NEXT: mul a6, a3, a5
; ZVE32F-NEXT: add a6, a1, a6
; ZVE32F-NEXT: mul a7, a2, a5
; ZVE32F-NEXT: add a7, a1, a7
@@ -679,9 +689,10 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
; ZVE32F-NEXT: sd a6, 24(a0)
; ZVE32F-NEXT: sd a7, 16(a0)
; ZVE32F-NEXT: addi a2, a2, 4
+; ZVE32F-NEXT: addi a3, a3, 4
+; ZVE32F-NEXT: addi a4, a4, -4
; ZVE32F-NEXT: addi a0, a0, 32
-; ZVE32F-NEXT: addi a4, a4, 4
-; ZVE32F-NEXT: bne a0, a3, .LBB11_1
+; ZVE32F-NEXT: bnez a4, .LBB11_1
; ZVE32F-NEXT: # %bb.2: # %bb18
; ZVE32F-NEXT: ret
bb:
@@ -716,8 +727,7 @@ declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>,
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V: # %bb.0: # %bb
-; V-NEXT: lui a2, 2
-; V-NEXT: add a2, a1, a2
+; V-NEXT: li a2, 1024
; V-NEXT: li a3, 40
; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT: .LBB12_1: # %bb2
@@ -728,18 +738,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
; V-NEXT: addi a4, a0, 80
; V-NEXT: vsse64.v v8, (a0), a3
; V-NEXT: vsse64.v v9, (a4), a3
+; V-NEXT: addi a2, a2, -4
; V-NEXT: addi a1, a1, 32
; V-NEXT: addi a0, a0, 160
-; V-NEXT: bne a1, a2, .LBB12_1
+; V-NEXT: bnez a2, .LBB12_1
; V-NEXT: # %bb.2: # %bb18
; V-NEXT: ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F: # %bb.0: # %bb
; ZVE32F-NEXT: li a2, 0
-; ZVE32F-NEXT: lui a3, 2
-; ZVE32F-NEXT: add a3, a1, a3
-; ZVE32F-NEXT: li a4, 1
+; ZVE32F-NEXT: li a3, 1
+; ZVE32F-NEXT: li a4, 1024
; ZVE32F-NEXT: li a5, 40
; ZVE32F-NEXT: .LBB12_1: # %bb2
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
@@ -747,7 +757,7 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
; ZVE32F-NEXT: ld a7, 0(a1)
; ZVE32F-NEXT: ld t0, 24(a1)
; ZVE32F-NEXT: ld t1, 16(a1)
-; ZVE32F-NEXT: mul t2, a4, a5
+; ZVE32F-NEXT: mul t2, a3, a5
; ZVE32F-NEXT: add t2, a0, t2
; ZVE32F-NEXT: mul t3, a2, a5
; ZVE32F-NEXT: add t3, a0, t3
@@ -756,9 +766,10 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
; ZVE32F-NEXT: sd t1, 80(t3)
; ZVE32F-NEXT: sd t0, 80(t2)
; ZVE32F-NEXT: addi a2, a2, 4
+; ZVE32F-NEXT: addi a3, a3, 4
+; ZVE32F-NEXT: addi a4, a4, -4
; ZVE32F-NEXT: addi a1, a1, 32
-; ZVE32F-NEXT: addi a4, a4, 4
-; ZVE32F-NEXT: bne a1, a3, .LBB12_1
+; ZVE32F-NEXT: bnez a4, .LBB12_1
; ZVE32F-NEXT: # %bb.2: # %bb18
; ZVE32F-NEXT: ret
bb:
@@ -795,56 +806,53 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: beq a2, a3, .LBB13_7
; CHECK-NEXT: # %bb.1: # %bb3
-; CHECK-NEXT: li a3, 1023
-; CHECK-NEXT: subw a5, a3, a2
-; CHECK-NEXT: li a6, 31
-; CHECK-NEXT: mv a4, a2
-; CHECK-NEXT: bltu a5, a6, .LBB13_5
+; CHECK-NEXT: li a4, 1023
+; CHECK-NEXT: subw a4, a4, a2
+; CHECK-NEXT: li a5, 31
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: bltu a4, a5, .LBB13_5
; CHECK-NEXT: # %bb.2: # %bb9
-; CHECK-NEXT: slli a5, a5, 32
-; CHECK-NEXT: srli a5, a5, 32
-; CHECK-NEXT: addi a5, a5, 1
-; CHECK-NEXT: andi a6, a5, -32
-; CHECK-NEXT: add a4, a6, a2
-; CHECK-NEXT: slli t0, a2, 2
-; CHECK-NEXT: add a7, a0, a2
+; CHECK-NEXT: slli a4, a4, 32
+; CHECK-NEXT: srli a4, a4, 32
+; CHECK-NEXT: addi a4, a4, 1
+; CHECK-NEXT: andi a5, a4, -32
+; CHECK-NEXT: add a3, a5, a2
+; CHECK-NEXT: slli a7, a2, 2
+; CHECK-NEXT: add a6, a0, a2
; CHECK-NEXT: add a2, a1, a2
-; CHECK-NEXT: add a2, a2, t0
-; CHECK-NEXT: add t0, a4, a0
-; CHECK-NEXT: li t2, 32
-; CHECK-NEXT: li t1, 5
-; CHECK-NEXT: vsetvli zero, t2, e8, m1, ta, ma
+; CHECK-NEXT: add a2, a2, a7
+; CHECK-NEXT: li t0, 32
+; CHECK-NEXT: li a7, 5
+; CHECK-NEXT: vsetvli zero, t0, e8, m1, ta, ma
+; CHECK-NEXT: mv t0, a5
; CHECK-NEXT: .LBB13_3: # %bb15
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vlse8.v v8, (a2), t1
-; CHECK-NEXT: vle8.v v9, (a7)
+; CHECK-NEXT: vlse8.v v8, (a2), a7
+; CHECK-NEXT: vle8.v v9, (a6)
; CHECK-NEXT: vadd.vv v8, v9, v8
-; CHECK-NEXT: vse8.v v8, (a7)
-; CHECK-NEXT: addi a7, a7, 32
+; CHECK-NEXT: vse8.v v8, (a6)
+; CHECK-NEXT: addi t0, t0, -32
+; CHECK-NEXT: addi a6, a6, 32
; CHECK-NEXT: addi a2, a2, 160
-; CHECK-NEXT: bne a7, t0, .LBB13_3
+; CHECK-NEXT: bnez t0, .LBB13_3
; CHECK-NEXT: # %bb.4: # %bb30
-; CHECK-NEXT: beq a5, a6, .LBB13_7
+; CHECK-NEXT: beq a4, a5, .LBB13_7
; CHECK-NEXT: .LBB13_5: # %bb32
-; CHECK-NEXT: add a2, a0, a4
-; CHECK-NEXT: slli a5, a4, 2
-; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: add a1, a1, a5
-; CHECK-NEXT: subw a3, a3, a4
-; CHECK-NEXT: slli a3, a3, 32
-; CHECK-NEXT: srli a3, a3, 32
-; CHECK-NEXT: add a0, a4, a0
+; CHECK-NEXT: addi a2, a3, -1024
; CHECK-NEXT: add a0, a0, a3
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: slli a4, a3, 2
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: .LBB13_6: # %bb35
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lbu a3, 0(a1)
-; CHECK-NEXT: lbu a4, 0(a2)
+; CHECK-NEXT: lbu a4, 0(a0)
; CHECK-NEXT: add a3, a4, a3
-; CHECK-NEXT: sb a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 1
+; CHECK-NEXT: sb a3, 0(a0)
+; CHECK-NEXT: addiw a2, a2, 1
+; CHECK-NEXT: addi a0, a0, 1
; CHECK-NEXT: addi a1, a1, 5
-; CHECK-NEXT: bne a2, a0, .LBB13_6
+; CHECK-NEXT: bnez a2, .LBB13_6
; CHECK-NEXT: .LBB13_7: # %bb34
; CHECK-NEXT: ret
bb:
@@ -918,10 +926,6 @@ define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: beqz a2, .LBB14_3
; CHECK-NEXT: # %bb.1: # %bb2
-; CHECK-NEXT: addi a2, a2, -16
-; CHECK-NEXT: andi a2, a2, -16
-; CHECK-NEXT: add a2, a2, a0
-; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT: .LBB14_2: # %bb4
@@ -930,9 +934,10 @@ define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -16
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: addi a1, a1, 80
-; CHECK-NEXT: bne a0, a2, .LBB14_2
+; CHECK-NEXT: bnez a2, .LBB14_2
; CHECK-NEXT: .LBB14_3: # %bb16
; CHECK-NEXT: ret
bb:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index b359f71be0e6..72f2447e80c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -1006,4 +1006,3 @@ vector.body: ; preds = %vector.body, %entry
for.cond.cleanup: ; preds = %vector.body
ret void
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index 32ef08101407..65776339de07 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -1037,4 +1037,3 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
%v = call <32 x double> @llvm.vp.fma.v32f64(<32 x double> %va, <32 x double> %b, <32 x double> %c, <32 x i1> %m, i32 %evl)
ret <32 x double> %v
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index e05d6b1525ee..28ab179048ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -801,4 +801,3 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
%v = call <32 x double> @llvm.vp.fmuladd.v32f64(<32 x double> %va, <32 x double> %b, <32 x double> %c, <32 x i1> %m, i32 %evl)
ret <32 x double> %v
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
index 2453e5423e13..d57c012fec1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -923,4 +923,3 @@ define <2 x i64> @vwmulu_vx_v2i64_i64(ptr %x, ptr %y) {
%g = mul <2 x i64> %e, %f
ret <2 x i64> %g
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll
index 4c5b5cfbd9d9..7a20dd1d32b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-x.ll
@@ -1849,4 +1849,3 @@ entry:
}
declare <16 x float> @llvm.riscv.sf.vc.v.i.se.nxv16f32.iXLen.iXLen(iXLen, iXLen, iXLen, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll
index e845dffecff8..b553a62ae496 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xv.ll
@@ -3669,4 +3669,3 @@ entry:
}
declare <16 x float> @llvm.riscv.sf.vc.v.fv.se.nxv16f32.nxv16f32.iXLen.f32(iXLen, <16 x float>, float, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll
index 2a9153fc0f82..44ffffc7e59d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvv.ll
@@ -3669,4 +3669,3 @@ entry:
}
declare <16 x float> @llvm.riscv.sf.vc.v.fvv.se.nxv16f32.nxv16f32.nxv16i32.f32.iXLen(iXLen, <16 x float>, <16 x i32>, float %rs1, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll
index ec34791e6572..ea6b936843c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-xsfvcp-xvw.ll
@@ -2694,4 +2694,3 @@ entry:
}
declare <8 x double> @llvm.riscv.sf.vc.v.fvw.se.nxv8f64.nxv8f32.nxv8i32.f32.iXLen(iXLen, <8 x double>, <8 x i32>, float, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
index 46b1dd9d2b46..9dcb6d211cb9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll
@@ -931,33 +931,15 @@ define <vscale x 4 x i8> @rint_nxv4f32_to_ui8(<vscale x 4 x float> %x) {
define <vscale x 4 x i16> @rint_nxv4f32_to_si16(<vscale x 4 x float> %x) {
; RV32-LABEL: rint_nxv4f32_to_si16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT: vfabs.v v10, v8
-; RV32-NEXT: lui a0, 307200
-; RV32-NEXT: fmv.w.x fa5, a0
-; RV32-NEXT: vmflt.vf v0, v10, fa5
-; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV32-NEXT: vfncvt.rtz.x.f.w v10, v8
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vfncvt.x.f.w v10, v8
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f32_to_si16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT: vfabs.v v10, v8
-; RV64-NEXT: lui a0, 307200
-; RV64-NEXT: fmv.w.x fa5, a0
-; RV64-NEXT: vmflt.vf v0, v10, fa5
-; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV64-NEXT: vfncvt.rtz.x.f.w v10, v8
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vfncvt.x.f.w v10, v8
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
%a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
@@ -968,33 +950,15 @@ define <vscale x 4 x i16> @rint_nxv4f32_to_si16(<vscale x 4 x float> %x) {
define <vscale x 4 x i16> @rint_nxv4f32_to_ui16(<vscale x 4 x float> %x) {
; RV32-LABEL: rint_nxv4f32_to_ui16:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT: vfabs.v v10, v8
-; RV32-NEXT: lui a0, 307200
-; RV32-NEXT: fmv.w.x fa5, a0
-; RV32-NEXT: vmflt.vf v0, v10, fa5
-; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV32-NEXT: vfncvt.rtz.xu.f.w v10, v8
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vfncvt.xu.f.w v10, v8
; RV32-NEXT: vmv.v.v v8, v10
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f32_to_ui16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT: vfabs.v v10, v8
-; RV64-NEXT: lui a0, 307200
-; RV64-NEXT: fmv.w.x fa5, a0
-; RV64-NEXT: vmflt.vf v0, v10, fa5
-; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; RV64-NEXT: vfncvt.rtz.xu.f.w v10, v8
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vfncvt.xu.f.w v10, v8
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
%a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
@@ -1006,29 +970,13 @@ define <vscale x 4 x i32> @rint_nxv4f32_to_si32(<vscale x 4 x float> %x) {
; RV32-LABEL: rint_nxv4f32_to_si32:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT: vfabs.v v10, v8
-; RV32-NEXT: lui a0, 307200
-; RV32-NEXT: fmv.w.x fa5, a0
-; RV32-NEXT: vmflt.vf v0, v10, fa5
-; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV32-NEXT: vfcvt.rtz.x.f.v v8, v8
+; RV32-NEXT: vfcvt.x.f.v v8, v8
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f32_to_si32:
; RV64: # %bb.0:
; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT: vfabs.v v10, v8
-; RV64-NEXT: lui a0, 307200
-; RV64-NEXT: fmv.w.x fa5, a0
-; RV64-NEXT: vmflt.vf v0, v10, fa5
-; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV64-NEXT: vfcvt.rtz.x.f.v v8, v8
+; RV64-NEXT: vfcvt.x.f.v v8, v8
; RV64-NEXT: ret
%a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
%b = fptosi <vscale x 4 x float> %a to <vscale x 4 x i32>
@@ -1039,29 +987,13 @@ define <vscale x 4 x i32> @rint_nxv4f32_to_ui32(<vscale x 4 x float> %x) {
; RV32-LABEL: rint_nxv4f32_to_ui32:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT: vfabs.v v10, v8
-; RV32-NEXT: lui a0, 307200
-; RV32-NEXT: fmv.w.x fa5, a0
-; RV32-NEXT: vmflt.vf v0, v10, fa5
-; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV32-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; RV32-NEXT: vfcvt.xu.f.v v8, v8
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f32_to_ui32:
; RV64: # %bb.0:
; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT: vfabs.v v10, v8
-; RV64-NEXT: lui a0, 307200
-; RV64-NEXT: fmv.w.x fa5, a0
-; RV64-NEXT: vmflt.vf v0, v10, fa5
-; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV64-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; RV64-NEXT: vfcvt.xu.f.v v8, v8
; RV64-NEXT: ret
%a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
%b = fptoui <vscale x 4 x float> %a to <vscale x 4 x i32>
@@ -1072,30 +1004,14 @@ define <vscale x 4 x i64> @rint_nxv4f32_to_si64(<vscale x 4 x float> %x) {
; RV32-LABEL: rint_nxv4f32_to_si64:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT: vfabs.v v10, v8
-; RV32-NEXT: lui a0, 307200
-; RV32-NEXT: fmv.w.x fa5, a0
-; RV32-NEXT: vmflt.vf v0, v10, fa5
-; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV32-NEXT: vfwcvt.rtz.x.f.v v12, v8
+; RV32-NEXT: vfwcvt.x.f.v v12, v8
; RV32-NEXT: vmv4r.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f32_to_si64:
; RV64: # %bb.0:
; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT: vfabs.v v10, v8
-; RV64-NEXT: lui a0, 307200
-; RV64-NEXT: fmv.w.x fa5, a0
-; RV64-NEXT: vmflt.vf v0, v10, fa5
-; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV64-NEXT: vfwcvt.rtz.x.f.v v12, v8
+; RV64-NEXT: vfwcvt.x.f.v v12, v8
; RV64-NEXT: vmv4r.v v8, v12
; RV64-NEXT: ret
%a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
@@ -1107,30 +1023,14 @@ define <vscale x 4 x i64> @rint_nxv4f32_to_ui64(<vscale x 4 x float> %x) {
; RV32-LABEL: rint_nxv4f32_to_ui64:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV32-NEXT: vfabs.v v10, v8
-; RV32-NEXT: lui a0, 307200
-; RV32-NEXT: fmv.w.x fa5, a0
-; RV32-NEXT: vmflt.vf v0, v10, fa5
-; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV32-NEXT: vfwcvt.rtz.xu.f.v v12, v8
+; RV32-NEXT: vfwcvt.xu.f.v v12, v8
; RV32-NEXT: vmv4r.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: rint_nxv4f32_to_ui64:
; RV64: # %bb.0:
; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; RV64-NEXT: vfabs.v v10, v8
-; RV64-NEXT: lui a0, 307200
-; RV64-NEXT: fmv.w.x fa5, a0
-; RV64-NEXT: vmflt.vf v0, v10, fa5
-; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t
-; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t
-; RV64-NEXT: vfwcvt.rtz.xu.f.v v12, v8
+; RV64-NEXT: vfwcvt.xu.f.v v12, v8
; RV64-NEXT: vmv4r.v v8, v12
; RV64-NEXT: ret
%a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 7497051027fa..f1a82b9e427e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -39,12 +39,8 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT: lui a0, 524288
-; CHECK-V-NEXT: addiw a1, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v8, a1
-; CHECK-V-NEXT: vmax.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclip.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
@@ -79,11 +75,8 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT: li a0, -1
-; CHECK-V-NEXT: srli a0, a0, 32
-; CHECK-V-NEXT: vminu.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <2 x double> %x to <2 x i64>
@@ -198,13 +191,7 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8
-; CHECK-V-NEXT: lui a0, 524288
-; CHECK-V-NEXT: addiw a1, a0, -1
-; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT: vmin.vx v8, v10, a1
-; CHECK-V-NEXT: vmax.vx v10, v8, a0
-; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclip.wi v8, v10, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <4 x float> %x to <4 x i64>
@@ -257,12 +244,7 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-V-NEXT: li a0, -1
-; CHECK-V-NEXT: srli a0, a0, 32
-; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT: vminu.vx v10, v10, a0
-; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v10, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <4 x float> %x to <4 x i64>
@@ -510,12 +492,8 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
-; CHECK-V-NEXT: lui a0, 524288
-; CHECK-V-NEXT: addiw a1, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v10, a1
-; CHECK-V-NEXT: vmax.vx v10, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclip.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
@@ -682,11 +660,8 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
-; CHECK-V-NEXT: li a0, -1
-; CHECK-V-NEXT: srli a0, a0, 32
-; CHECK-V-NEXT: vminu.vx v10, v10, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
@@ -925,13 +900,8 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8
-; CHECK-V-NEXT: lui a0, 8
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v9, a0
-; CHECK-V-NEXT: lui a0, 1048568
-; CHECK-V-NEXT: vmax.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclip.wi v8, v9, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
@@ -966,11 +936,8 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-V-NEXT: vfncvt.rtz.xu.f.w v9, v8
-; CHECK-V-NEXT: lui a0, 16
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vminu.vx v8, v9, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v9, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <2 x double> %x to <2 x i32>
@@ -1087,13 +1054,8 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT: lui a0, 8
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v8, a0
-; CHECK-V-NEXT: lui a0, 1048568
-; CHECK-V-NEXT: vmax.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclip.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
@@ -1146,11 +1108,8 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT: lui a0, 16
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vminu.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <4 x float> %x to <4 x i32>
@@ -1525,13 +1484,8 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
-; CHECK-V-NEXT: lui a0, 8
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v10, a0
-; CHECK-V-NEXT: lui a0, 1048568
-; CHECK-V-NEXT: vmax.vx v10, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclip.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add sp, sp, a0
@@ -1808,11 +1762,8 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
-; CHECK-V-NEXT: lui a0, 16
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vminu.vx v10, v10, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add sp, sp, a0
@@ -3385,12 +3336,8 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT: lui a0, 524288
-; CHECK-V-NEXT: addiw a1, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v8, a1
-; CHECK-V-NEXT: vmax.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclip.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <2 x double> %x to <2 x i64>
@@ -3423,11 +3370,8 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT: li a0, -1
-; CHECK-V-NEXT: srli a0, a0, 32
-; CHECK-V-NEXT: vminu.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <2 x double> %x to <2 x i64>
@@ -3539,13 +3483,7 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8
-; CHECK-V-NEXT: lui a0, 524288
-; CHECK-V-NEXT: addiw a1, a0, -1
-; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT: vmin.vx v8, v10, a1
-; CHECK-V-NEXT: vmax.vx v10, v8, a0
-; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclip.wi v8, v10, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <4 x float> %x to <4 x i64>
@@ -3596,12 +3534,7 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-V-NEXT: li a0, -1
-; CHECK-V-NEXT: srli a0, a0, 32
-; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT: vminu.vx v10, v10, a0
-; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v10, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <4 x float> %x to <4 x i64>
@@ -3846,12 +3779,8 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
-; CHECK-V-NEXT: lui a0, 524288
-; CHECK-V-NEXT: addiw a1, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v10, a1
-; CHECK-V-NEXT: vmax.vx v10, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclip.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
@@ -4016,11 +3945,8 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
; CHECK-V-NEXT: addi a0, a0, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 3
-; CHECK-V-NEXT: li a0, -1
-; CHECK-V-NEXT: srli a0, a0, 32
-; CHECK-V-NEXT: vminu.vx v10, v10, a0
; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 2
; CHECK-V-NEXT: add sp, sp, a0
@@ -4256,13 +4182,8 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8
-; CHECK-V-NEXT: lui a0, 8
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v9, a0
-; CHECK-V-NEXT: lui a0, 1048568
-; CHECK-V-NEXT: vmax.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclip.wi v8, v9, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
@@ -4295,11 +4216,8 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-V-NEXT: vfncvt.rtz.xu.f.w v9, v8
-; CHECK-V-NEXT: lui a0, 16
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vminu.vx v8, v9, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v9, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <2 x double> %x to <2 x i32>
@@ -4413,13 +4331,8 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT: lui a0, 8
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v8, a0
-; CHECK-V-NEXT: lui a0, 1048568
-; CHECK-V-NEXT: vmax.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclip.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
@@ -4470,11 +4383,8 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) {
; CHECK-V: # %bb.0: # %entry
; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-V-NEXT: vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT: lui a0, 16
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vminu.vx v8, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v8, 0
; CHECK-V-NEXT: ret
entry:
%conv = fptoui <4 x float> %x to <4 x i32>
@@ -4846,13 +4756,8 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
-; CHECK-V-NEXT: lui a0, 8
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vmin.vx v8, v10, a0
-; CHECK-V-NEXT: lui a0, 1048568
-; CHECK-V-NEXT: vmax.vx v10, v8, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclip.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add sp, sp, a0
@@ -5125,11 +5030,8 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
; CHECK-V-NEXT: addi a0, sp, 16
; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vslideup.vi v10, v8, 7
-; CHECK-V-NEXT: lui a0, 16
-; CHECK-V-NEXT: addi a0, a0, -1
-; CHECK-V-NEXT: vminu.vx v10, v10, a0
; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT: vnclipu.wi v8, v10, 0
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
; CHECK-V-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
index 242034f9826c..52322f64416c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll
@@ -342,4 +342,3 @@ define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
%x = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)
ret <vscale x 4 x i64> %x
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
index 2e960209f9ed..6de62214ccc4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll
@@ -653,17 +653,8 @@ declare <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half>)
define <vscale x 1 x i8> @rint_nxv1f16_to_si8(<vscale x 1 x half> %x) {
; CHECK-LABEL: rint_nxv1f16_to_si8:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI32_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI32_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x)
@@ -674,17 +665,8 @@ define <vscale x 1 x i8> @rint_nxv1f16_to_si8(<vscale x 1 x half> %x) {
define <vscale x 1 x i8> @rint_nxv1f16_to_ui8(<vscale x 1 x half> %x) {
; CHECK-LABEL: rint_nxv1f16_to_ui8:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI33_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI33_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x)
@@ -695,16 +677,8 @@ define <vscale x 1 x i8> @rint_nxv1f16_to_ui8(<vscale x 1 x half> %x) {
define <vscale x 1 x i16> @rint_nxv1f16_to_si16(<vscale x 1 x half> %x) {
; CHECK-LABEL: rint_nxv1f16_to_si16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI34_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI34_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
+; CHECK-NEXT: vfcvt.x.f.v v8, v8
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x)
%b = fptosi <vscale x 1 x half> %a to <vscale x 1 x i16>
@@ -714,16 +688,8 @@ define <vscale x 1 x i16> @rint_nxv1f16_to_si16(<vscale x 1 x half> %x) {
define <vscale x 1 x i16> @rint_nxv1f16_to_ui16(<vscale x 1 x half> %x) {
; CHECK-LABEL: rint_nxv1f16_to_ui16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI35_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI35_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; CHECK-NEXT: vfcvt.xu.f.v v8, v8
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x)
%b = fptoui <vscale x 1 x half> %a to <vscale x 1 x i16>
@@ -733,16 +699,8 @@ define <vscale x 1 x i16> @rint_nxv1f16_to_ui16(<vscale x 1 x half> %x) {
define <vscale x 1 x i32> @rint_nxv1f16_to_si32(<vscale x 1 x half> %x) {
; CHECK-LABEL: rint_nxv1f16_to_si32:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI36_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI36_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfwcvt.rtz.x.f.v v9, v8
+; CHECK-NEXT: vfwcvt.x.f.v v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x)
@@ -753,16 +711,8 @@ define <vscale x 1 x i32> @rint_nxv1f16_to_si32(<vscale x 1 x half> %x) {
define <vscale x 1 x i32> @rint_nxv1f16_to_ui32(<vscale x 1 x half> %x) {
; CHECK-LABEL: rint_nxv1f16_to_ui32:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI37_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI37_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfwcvt.rtz.xu.f.v v9, v8
+; CHECK-NEXT: vfwcvt.xu.f.v v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x)
@@ -889,17 +839,8 @@ declare <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half>)
define <vscale x 4 x i8> @rint_nxv4f16_to_si8(<vscale x 4 x half> %x) {
; CHECK-LABEL: rint_nxv4f16_to_si8:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI40_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI40_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vfncvt.x.f.w v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
@@ -910,17 +851,8 @@ define <vscale x 4 x i8> @rint_nxv4f16_to_si8(<vscale x 4 x half> %x) {
define <vscale x 4 x i8> @rint_nxv4f16_to_ui8(<vscale x 4 x half> %x) {
; CHECK-LABEL: rint_nxv4f16_to_ui8:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI41_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI41_0)(a0)
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vfncvt.xu.f.w v9, v8
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
@@ -931,16 +863,8 @@ define <vscale x 4 x i8> @rint_nxv4f16_to_ui8(<vscale x 4 x half> %x) {
define <vscale x 4 x i16> @rint_nxv4f16_to_si16(<vscale x 4 x half> %x) {
; CHECK-LABEL: rint_nxv4f16_to_si16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI42_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI42_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8
+; CHECK-NEXT: vfcvt.x.f.v v8, v8
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
%b = fptosi <vscale x 4 x half> %a to <vscale x 4 x i16>
@@ -950,16 +874,8 @@ define <vscale x 4 x i16> @rint_nxv4f16_to_si16(<vscale x 4 x half> %x) {
define <vscale x 4 x i16> @rint_nxv4f16_to_ui16(<vscale x 4 x half> %x) {
; CHECK-LABEL: rint_nxv4f16_to_ui16:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI43_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI43_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8
+; CHECK-NEXT: vfcvt.xu.f.v v8, v8
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
%b = fptoui <vscale x 4 x half> %a to <vscale x 4 x i16>
@@ -969,16 +885,8 @@ define <vscale x 4 x i16> @rint_nxv4f16_to_ui16(<vscale x 4 x half> %x) {
define <vscale x 4 x i32> @rint_nxv4f16_to_si32(<vscale x 4 x half> %x) {
; CHECK-LABEL: rint_nxv4f16_to_si32:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI44_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI44_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8
+; CHECK-NEXT: vfwcvt.x.f.v v10, v8
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
@@ -989,16 +897,8 @@ define <vscale x 4 x i32> @rint_nxv4f16_to_si32(<vscale x 4 x half> %x) {
define <vscale x 4 x i32> @rint_nxv4f16_to_ui32(<vscale x 4 x half> %x) {
; CHECK-LABEL: rint_nxv4f16_to_ui32:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, %hi(.LCPI45_0)
-; CHECK-NEXT: flh fa5, %lo(.LCPI45_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfabs.v v9, v8
-; CHECK-NEXT: vmflt.vf v0, v9, fa5
-; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
-; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8
+; CHECK-NEXT: vfwcvt.xu.f.v v10, v8
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x)
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll
index d8b312e9def4..ef5ad1f651fa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll
@@ -726,4 +726,3 @@ define <vscale x 8 x i1> @trunc_nxv8i64_nxv8i1(<vscale x 8 x i64> %v) {
%r = trunc <vscale x 8 x i64> %v to <vscale x 8 x i1>
ret <vscale x 8 x i1> %r
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll
index aae935e099fe..aaac0c7b5a41 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll
@@ -726,4 +726,3 @@ define <vscale x 8 x i1> @trunc_nxv8i64_nxv8i1(<vscale x 8 x i64> %v) {
%r = trunc <vscale x 8 x i64> %v to <vscale x 8 x i1>
ret <vscale x 8 x i1> %r
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll
index b4f4a879a0b5..b44b57394321 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll
@@ -13,7 +13,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv1i8.nxv1f32.iXLen(
define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v9, v8, fa0
; CHECK-NEXT: fsrm a0
@@ -39,7 +39,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv1i8.nxv1f32.iXL
define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv1i8_nxv1f32(<vscale x 1 x i8> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv1i8_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v9, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -64,11 +64,11 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv2i8.nxv2f32.iXLen(
define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv2i8_nxv2f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv2i8_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v9, v8, fa0
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv2i8.nxv2f32.iXLen(
@@ -90,7 +90,7 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv2i8.nxv2f32.iXL
define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv2i8_nxv2f32(<vscale x 2 x i8> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv2i8_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v9, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -115,7 +115,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv4i8.nxv4f32.iXLen(
define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv4i8_nxv4f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv4i8_nxv4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v10, v8, fa0
; CHECK-NEXT: fsrm a0
@@ -141,7 +141,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv4i8.nxv4f32.iXL
define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv4i8_nxv4f32(<vscale x 4 x i8> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv4i8_nxv4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v10, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -166,11 +166,11 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv8i8.nxv8f32.iXLen(
define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv8i8_nxv8f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv8i8_nxv8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v12, v8, fa0
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vmv1r.v v8, v12
+; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv8i8.nxv8f32.iXLen(
@@ -192,7 +192,7 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv8i8.nxv8f32.iXL
define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv8i8_nxv8f32(<vscale x 8 x i8> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv8i8_nxv8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v12, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -217,11 +217,11 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv16i8.nxv16f32.iXLen
define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv16i8_nxv16f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv16i8_nxv16f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v16, v8, fa0
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vmv2r.v v8, v16
+; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv16i8.nxv16f32.iXLen(
@@ -243,7 +243,7 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv16i8.nxv16f32.
define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv16i8_nxv16f32(<vscale x 16 x i8> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv16i8_nxv16f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v16, fa0, v0.t
; CHECK-NEXT: fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll
index 363cccd5ad35..bc2f7ca7dc86 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll
@@ -13,7 +13,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv1i8.nxv1f32.iXLen(
define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v9, v8, fa0
; CHECK-NEXT: fsrm a0
@@ -39,7 +39,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv1i8.nxv1f32.iX
define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv1i8_nxv1f32(<vscale x 1 x i8> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv1i8_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v9, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -64,11 +64,11 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv2i8.nxv2f32.iXLen(
define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv2i8_nxv2f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv2i8_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v9, v8, fa0
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv2i8.nxv2f32.iXLen(
@@ -90,7 +90,7 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv2i8.nxv2f32.iX
define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv2i8_nxv2f32(<vscale x 2 x i8> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv2i8_nxv2f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v9, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -115,7 +115,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv4i8.nxv4f32.iXLen(
define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv4i8_nxv4f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv4i8_nxv4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v10, v8, fa0
; CHECK-NEXT: fsrm a0
@@ -141,7 +141,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv4i8.nxv4f32.iX
define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv4i8_nxv4f32(<vscale x 4 x i8> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv4i8_nxv4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v10, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -166,11 +166,11 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv8i8.nxv8f32.iXLen(
define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv8i8_nxv8f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv8i8_nxv8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v12, v8, fa0
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vmv1r.v v8, v12
+; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv8i8.nxv8f32.iXLen(
@@ -192,7 +192,7 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv8i8.nxv8f32.iX
define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv8i8_nxv8f32(<vscale x 8 x i8> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv8i8_nxv8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v12, fa0, v0.t
; CHECK-NEXT: fsrm a0
@@ -217,11 +217,11 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv16i8.nxv16f32.iXLe
define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv16i8_nxv16f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv16i8_nxv16f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v16, v8, fa0
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vmv2r.v v8, v16
+; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv16i8.nxv16f32.iXLen(
@@ -243,7 +243,7 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv16i8.nxv16f32
define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv16i8_nxv16f32(<vscale x 16 x i8> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv16i8_nxv16f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
; CHECK-NEXT: fsrmi a0, 0
; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v16, fa0, v0.t
; CHECK-NEXT: fsrm a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll
index 2d591be2adc2..eebc51619480 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll
@@ -7,36 +7,36 @@
declare <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32>,
<vscale x 8 x i8>,
- <vscale x 8 x i8>,
+ <vscale x 4 x i8>,
iXLen, iXLen);
-define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 2 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 2 x i32> %a
@@ -45,36 +45,36 @@ entry:
declare <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32>,
<vscale x 8 x i8>,
- <vscale x 16 x i8>,
+ <vscale x 8 x i8>,
iXLen, iXLen);
-define <vscale x 4 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
-; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 4 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 4 x i32> %a
@@ -83,36 +83,36 @@ entry:
declare <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32>,
<vscale x 8 x i8>,
- <vscale x 32 x i8>,
+ <vscale x 16 x i8>,
iXLen, iXLen);
-define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
-; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
+; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 8 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 8 x i32> %a
@@ -121,38 +121,36 @@ entry:
declare <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32>,
<vscale x 8 x i8>,
- <vscale x 64 x i8>,
+ <vscale x 32 x i8>,
iXLen, iXLen);
-define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
+; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 16 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 16 x i32> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll
index bfdab33965c1..0d7052356e55 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll
@@ -7,36 +7,36 @@
declare <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32>,
<vscale x 8 x i8>,
- <vscale x 8 x i8>,
+ <vscale x 4 x i8>,
iXLen, iXLen);
-define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 2 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 2 x i32> %a
@@ -45,36 +45,36 @@ entry:
declare <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32>,
<vscale x 8 x i8>,
- <vscale x 16 x i8>,
+ <vscale x 8 x i8>,
iXLen, iXLen);
-define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
-; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 4 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 4 x i32> %a
@@ -83,36 +83,36 @@ entry:
declare <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32>,
<vscale x 8 x i8>,
- <vscale x 32 x i8>,
+ <vscale x 16 x i8>,
iXLen, iXLen);
-define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
-; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
+; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 8 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 8 x i32> %a
@@ -121,38 +121,36 @@ entry:
declare <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32>,
<vscale x 8 x i8>,
- <vscale x 64 x i8>,
+ <vscale x 32 x i8>,
iXLen, iXLen);
-define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
+; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 16 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 16 x i32> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll
index d1565fb9a634..3332390f71e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll
@@ -7,36 +7,36 @@
declare <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32>,
<vscale x 8 x i8>,
- <vscale x 8 x i8>,
+ <vscale x 4 x i8>,
iXLen, iXLen);
-define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 2 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 2 x i32> %a
@@ -45,36 +45,36 @@ entry:
declare <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32>,
<vscale x 8 x i8>,
- <vscale x 16 x i8>,
+ <vscale x 8 x i8>,
iXLen, iXLen);
-define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
-; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 4 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 4 x i32> %a
@@ -83,36 +83,36 @@ entry:
declare <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32>,
<vscale x 8 x i8>,
- <vscale x 32 x i8>,
+ <vscale x 16 x i8>,
iXLen, iXLen);
-define <vscale x 8 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
-; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
+; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 8 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 8 x i32> %a
@@ -121,38 +121,36 @@ entry:
declare <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32>,
<vscale x 8 x i8>,
- <vscale x 64 x i8>,
+ <vscale x 32 x i8>,
iXLen, iXLen);
-define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
+; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 16 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 16 x i32> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll
index c6d2a048c5cb..74fb66f5bf35 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll
@@ -7,36 +7,36 @@
declare <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32>,
<vscale x 8 x i8>,
- <vscale x 8 x i8>,
+ <vscale x 4 x i8>,
iXLen, iXLen);
-define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 2 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8(
<vscale x 2 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
+ <vscale x 4 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 2 x i32> %a
@@ -45,36 +45,36 @@ entry:
declare <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32>,
<vscale x 8 x i8>,
- <vscale x 16 x i8>,
+ <vscale x 8 x i8>,
iXLen, iXLen);
-define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
-; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 4 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v12
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v11
; CHECK-NEXT: ret
entry:
%a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8(
<vscale x 4 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 16 x i8> %2,
+ <vscale x 8 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 4 x i32> %a
@@ -83,36 +83,36 @@ entry:
declare <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32>,
<vscale x 8 x i8>,
- <vscale x 32 x i8>,
+ <vscale x 16 x i8>,
iXLen, iXLen);
-define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
-; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma
+; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 8 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v16
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v14
; CHECK-NEXT: ret
entry:
%a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8(
<vscale x 8 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 32 x i8> %2,
+ <vscale x 16 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 8 x i32> %a
@@ -121,38 +121,36 @@ entry:
declare <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32>,
<vscale x 8 x i8>,
- <vscale x 64 x i8>,
+ <vscale x 32 x i8>,
iXLen, iXLen);
-define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma
+; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 2)
ret <vscale x 16 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v24
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v20
; CHECK-NEXT: ret
entry:
%a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8(
<vscale x 16 x i32> %0,
<vscale x 8 x i8> %1,
- <vscale x 64 x i8> %2,
+ <vscale x 32 x i8> %2,
iXLen %3, iXLen 3)
ret <vscale x 16 x i32> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands-i1.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands-i1.ll
index 350c888a2c7d..77cf6f6a25ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands-i1.ll
@@ -12,7 +12,7 @@ define void @sink_splat_vp_and_i1(ptr nocapture %a, i1 zeroext %x, <8 x i1> %m,
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v8, a1
; CHECK-NEXT: vmsne.vi v8, v8, 0
-; CHECK-NEXT: addi a1, a0, 1024
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: .LBB0_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vlm.v v9, (a0)
@@ -20,8 +20,9 @@ define void @sink_splat_vp_and_i1(ptr nocapture %a, i1 zeroext %x, <8 x i1> %m,
; CHECK-NEXT: vmand.mm v9, v9, v8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vsm.v v9, (a0)
+; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: addi a0, a0, 1
-; CHECK-NEXT: bne a0, a1, .LBB0_1
+; CHECK-NEXT: bnez a1, .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index ede331cc376f..9b083fc286e7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -5,16 +5,16 @@
define void @sink_splat_mul(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_mul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB0_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmul.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB0_1
+; CHECK-NEXT: bnez a2, .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -39,16 +39,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_add(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_add:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB1_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB1_1
+; CHECK-NEXT: bnez a2, .LBB1_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -73,16 +73,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_sub(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_sub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB2_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsub.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB2_1
+; CHECK-NEXT: bnez a2, .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -107,16 +107,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_rsub(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_rsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB3_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vrsub.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB3_1
+; CHECK-NEXT: bnez a2, .LBB3_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -141,16 +141,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_and(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_and:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB4_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vand.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB4_1
+; CHECK-NEXT: bnez a2, .LBB4_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -175,16 +175,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_or(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_or:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB5_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vor.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB5_1
+; CHECK-NEXT: bnez a2, .LBB5_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -209,16 +209,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_xor(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_xor:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB6_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB6_1
+; CHECK-NEXT: bnez a2, .LBB6_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -244,42 +244,42 @@ define void @sink_splat_mul_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_mul_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB7_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB7_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB7_5
; CHECK-NEXT: .LBB7_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB7_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vmul.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB7_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB7_7
; CHECK-NEXT: .LBB7_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB7_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: mul a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB7_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: mul a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -335,42 +335,42 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_add_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB8_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB8_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB8_5
; CHECK-NEXT: .LBB8_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB8_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vadd.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB8_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB8_7
; CHECK-NEXT: .LBB8_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB8_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB8_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB8_6
; CHECK-NEXT: .LBB8_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -426,42 +426,42 @@ define void @sink_splat_sub_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_sub_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB9_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB9_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB9_5
; CHECK-NEXT: .LBB9_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB9_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vsub.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB9_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB9_7
; CHECK-NEXT: .LBB9_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB9_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB9_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB9_6
; CHECK-NEXT: .LBB9_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -517,42 +517,42 @@ define void @sink_splat_rsub_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_rsub_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB10_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB10_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB10_5
; CHECK-NEXT: .LBB10_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB10_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vrsub.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB10_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB10_7
; CHECK-NEXT: .LBB10_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB10_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: subw a3, a1, a3
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB10_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: subw a2, a1, a2
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB10_6
; CHECK-NEXT: .LBB10_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -608,42 +608,42 @@ define void @sink_splat_and_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_and_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB11_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB11_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB11_5
; CHECK-NEXT: .LBB11_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB11_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vand.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB11_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB11_7
; CHECK-NEXT: .LBB11_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB11_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: and a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB11_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: and a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB11_6
; CHECK-NEXT: .LBB11_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -699,42 +699,42 @@ define void @sink_splat_or_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_or_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB12_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB12_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB12_5
; CHECK-NEXT: .LBB12_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB12_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vor.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB12_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB12_7
; CHECK-NEXT: .LBB12_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB12_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: or a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB12_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: or a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB12_6
; CHECK-NEXT: .LBB12_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -790,42 +790,42 @@ define void @sink_splat_xor_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_xor_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB13_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB13_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB13_5
; CHECK-NEXT: .LBB13_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB13_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB13_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB13_7
; CHECK-NEXT: .LBB13_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB13_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: xor a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB13_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: xor a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB13_6
; CHECK-NEXT: .LBB13_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -880,16 +880,16 @@ for.body: ; preds = %for.body.preheader,
define void @sink_splat_shl(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_shl:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB14_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsll.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB14_1
+; CHECK-NEXT: bnez a2, .LBB14_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -914,16 +914,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_lshr(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_lshr:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB15_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsrl.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB15_1
+; CHECK-NEXT: bnez a2, .LBB15_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -948,16 +948,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_ashr(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_ashr:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB16_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsra.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB16_1
+; CHECK-NEXT: bnez a2, .LBB16_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -983,42 +983,42 @@ define void @sink_splat_shl_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_shl_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB17_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB17_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB17_5
; CHECK-NEXT: .LBB17_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB17_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vsll.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB17_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB17_7
; CHECK-NEXT: .LBB17_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB17_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: sllw a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB17_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: sllw a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB17_6
; CHECK-NEXT: .LBB17_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1074,42 +1074,42 @@ define void @sink_splat_lshr_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_lshr_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB18_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB18_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB18_5
; CHECK-NEXT: .LBB18_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB18_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vsrl.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB18_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB18_7
; CHECK-NEXT: .LBB18_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB18_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: srlw a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB18_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: srlw a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB18_6
; CHECK-NEXT: .LBB18_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1190,17 +1190,17 @@ define void @sink_splat_ashr_scalable(ptr nocapture %a) {
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a3, .LBB19_7
; CHECK-NEXT: .LBB19_5: # %for.body.preheader
+; CHECK-NEXT: addi a2, a1, -1024
; CHECK-NEXT: slli a1, a1, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a0, a0, a2
+; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: .LBB19_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a2, 0(a1)
-; CHECK-NEXT: srli a2, a2, 2
-; CHECK-NEXT: sw a2, 0(a1)
-; CHECK-NEXT: addi a1, a1, 4
-; CHECK-NEXT: bne a1, a0, .LBB19_6
+; CHECK-NEXT: lw a1, 0(a0)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: addi a2, a2, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a2, .LBB19_6
; CHECK-NEXT: .LBB19_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1255,16 +1255,16 @@ for.body: ; preds = %for.body.preheader,
define void @sink_splat_fmul(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fmul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB20_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfmul.vf v8, v8, fa0
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB20_1
+; CHECK-NEXT: bnez a1, .LBB20_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1289,16 +1289,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_fdiv(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fdiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB21_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfdiv.vf v8, v8, fa0
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB21_1
+; CHECK-NEXT: bnez a1, .LBB21_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1323,16 +1323,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_frdiv(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_frdiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB22_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfrdiv.vf v8, v8, fa0
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB22_1
+; CHECK-NEXT: bnez a1, .LBB22_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1357,16 +1357,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_fadd(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fadd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB23_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfadd.vf v8, v8, fa0
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB23_1
+; CHECK-NEXT: bnez a1, .LBB23_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1391,16 +1391,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_fsub(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB24_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfsub.vf v8, v8, fa0
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB24_1
+; CHECK-NEXT: bnez a1, .LBB24_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1425,16 +1425,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_frsub(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_frsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB25_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vfrsub.vf v8, v8, fa0
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB25_1
+; CHECK-NEXT: bnez a1, .LBB25_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1460,41 +1460,41 @@ define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fmul_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 2
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB26_2
+; CHECK-NEXT: srli a3, a1, 2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB26_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB26_5
; CHECK-NEXT: .LBB26_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: mv a6, a3
+; CHECK-NEXT: mv a6, a2
; CHECK-NEXT: .LBB26_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl1re32.v v8, (a5)
; CHECK-NEXT: vfmul.vf v8, v8, fa0
; CHECK-NEXT: vs1r.v v8, (a5)
-; CHECK-NEXT: sub a6, a6, a2
+; CHECK-NEXT: sub a6, a6, a3
; CHECK-NEXT: add a5, a5, a1
; CHECK-NEXT: bnez a6, .LBB26_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB26_7
; CHECK-NEXT: .LBB26_5: # %for.body.preheader
-; CHECK-NEXT: slli a1, a3, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 1
+; CHECK-NEXT: addi a1, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB26_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flw fa5, 0(a1)
+; CHECK-NEXT: flw fa5, 0(a0)
; CHECK-NEXT: fmul.s fa5, fa5, fa0
-; CHECK-NEXT: fsw fa5, 0(a1)
-; CHECK-NEXT: addi a1, a1, 4
-; CHECK-NEXT: bne a1, a0, .LBB26_6
+; CHECK-NEXT: fsw fa5, 0(a0)
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a1, .LBB26_6
; CHECK-NEXT: .LBB26_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1550,41 +1550,41 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fdiv_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 2
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB27_2
+; CHECK-NEXT: srli a3, a1, 2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB27_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB27_5
; CHECK-NEXT: .LBB27_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: mv a6, a3
+; CHECK-NEXT: mv a6, a2
; CHECK-NEXT: .LBB27_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl1re32.v v8, (a5)
; CHECK-NEXT: vfdiv.vf v8, v8, fa0
; CHECK-NEXT: vs1r.v v8, (a5)
-; CHECK-NEXT: sub a6, a6, a2
+; CHECK-NEXT: sub a6, a6, a3
; CHECK-NEXT: add a5, a5, a1
; CHECK-NEXT: bnez a6, .LBB27_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB27_7
; CHECK-NEXT: .LBB27_5: # %for.body.preheader
-; CHECK-NEXT: slli a1, a3, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 1
+; CHECK-NEXT: addi a1, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB27_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flw fa5, 0(a1)
+; CHECK-NEXT: flw fa5, 0(a0)
; CHECK-NEXT: fdiv.s fa5, fa5, fa0
-; CHECK-NEXT: fsw fa5, 0(a1)
-; CHECK-NEXT: addi a1, a1, 4
-; CHECK-NEXT: bne a1, a0, .LBB27_6
+; CHECK-NEXT: fsw fa5, 0(a0)
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a1, .LBB27_6
; CHECK-NEXT: .LBB27_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1640,41 +1640,41 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_frdiv_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 2
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB28_2
+; CHECK-NEXT: srli a3, a1, 2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB28_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB28_5
; CHECK-NEXT: .LBB28_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: mv a6, a3
+; CHECK-NEXT: mv a6, a2
; CHECK-NEXT: .LBB28_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl1re32.v v8, (a5)
; CHECK-NEXT: vfrdiv.vf v8, v8, fa0
; CHECK-NEXT: vs1r.v v8, (a5)
-; CHECK-NEXT: sub a6, a6, a2
+; CHECK-NEXT: sub a6, a6, a3
; CHECK-NEXT: add a5, a5, a1
; CHECK-NEXT: bnez a6, .LBB28_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB28_7
; CHECK-NEXT: .LBB28_5: # %for.body.preheader
-; CHECK-NEXT: slli a1, a3, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 1
+; CHECK-NEXT: addi a1, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB28_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flw fa5, 0(a1)
+; CHECK-NEXT: flw fa5, 0(a0)
; CHECK-NEXT: fdiv.s fa5, fa0, fa5
-; CHECK-NEXT: fsw fa5, 0(a1)
-; CHECK-NEXT: addi a1, a1, 4
-; CHECK-NEXT: bne a1, a0, .LBB28_6
+; CHECK-NEXT: fsw fa5, 0(a0)
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a1, .LBB28_6
; CHECK-NEXT: .LBB28_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1730,41 +1730,41 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fadd_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 2
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB29_2
+; CHECK-NEXT: srli a3, a1, 2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB29_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB29_5
; CHECK-NEXT: .LBB29_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: mv a6, a3
+; CHECK-NEXT: mv a6, a2
; CHECK-NEXT: .LBB29_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl1re32.v v8, (a5)
; CHECK-NEXT: vfadd.vf v8, v8, fa0
; CHECK-NEXT: vs1r.v v8, (a5)
-; CHECK-NEXT: sub a6, a6, a2
+; CHECK-NEXT: sub a6, a6, a3
; CHECK-NEXT: add a5, a5, a1
; CHECK-NEXT: bnez a6, .LBB29_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB29_7
; CHECK-NEXT: .LBB29_5: # %for.body.preheader
-; CHECK-NEXT: slli a1, a3, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 1
+; CHECK-NEXT: addi a1, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB29_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flw fa5, 0(a1)
+; CHECK-NEXT: flw fa5, 0(a0)
; CHECK-NEXT: fadd.s fa5, fa5, fa0
-; CHECK-NEXT: fsw fa5, 0(a1)
-; CHECK-NEXT: addi a1, a1, 4
-; CHECK-NEXT: bne a1, a0, .LBB29_6
+; CHECK-NEXT: fsw fa5, 0(a0)
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a1, .LBB29_6
; CHECK-NEXT: .LBB29_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1820,41 +1820,41 @@ define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_fsub_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 2
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB30_2
+; CHECK-NEXT: srli a3, a1, 2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB30_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB30_5
; CHECK-NEXT: .LBB30_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: mv a6, a3
+; CHECK-NEXT: mv a6, a2
; CHECK-NEXT: .LBB30_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl1re32.v v8, (a5)
; CHECK-NEXT: vfsub.vf v8, v8, fa0
; CHECK-NEXT: vs1r.v v8, (a5)
-; CHECK-NEXT: sub a6, a6, a2
+; CHECK-NEXT: sub a6, a6, a3
; CHECK-NEXT: add a5, a5, a1
; CHECK-NEXT: bnez a6, .LBB30_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB30_7
; CHECK-NEXT: .LBB30_5: # %for.body.preheader
-; CHECK-NEXT: slli a1, a3, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 1
+; CHECK-NEXT: addi a1, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB30_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flw fa5, 0(a1)
+; CHECK-NEXT: flw fa5, 0(a0)
; CHECK-NEXT: fsub.s fa5, fa5, fa0
-; CHECK-NEXT: fsw fa5, 0(a1)
-; CHECK-NEXT: addi a1, a1, 4
-; CHECK-NEXT: bne a1, a0, .LBB30_6
+; CHECK-NEXT: fsw fa5, 0(a0)
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a1, .LBB30_6
; CHECK-NEXT: .LBB30_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1910,41 +1910,41 @@ define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) {
; CHECK-LABEL: sink_splat_frsub_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 2
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB31_2
+; CHECK-NEXT: srli a3, a1, 2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB31_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB31_5
; CHECK-NEXT: .LBB31_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
; CHECK-NEXT: mv a5, a0
-; CHECK-NEXT: mv a6, a3
+; CHECK-NEXT: mv a6, a2
; CHECK-NEXT: .LBB31_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl1re32.v v8, (a5)
; CHECK-NEXT: vfrsub.vf v8, v8, fa0
; CHECK-NEXT: vs1r.v v8, (a5)
-; CHECK-NEXT: sub a6, a6, a2
+; CHECK-NEXT: sub a6, a6, a3
; CHECK-NEXT: add a5, a5, a1
; CHECK-NEXT: bnez a6, .LBB31_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB31_7
; CHECK-NEXT: .LBB31_5: # %for.body.preheader
-; CHECK-NEXT: slli a1, a3, 2
-; CHECK-NEXT: add a1, a0, a1
-; CHECK-NEXT: lui a2, 1
+; CHECK-NEXT: addi a1, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB31_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flw fa5, 0(a1)
+; CHECK-NEXT: flw fa5, 0(a0)
; CHECK-NEXT: fsub.s fa5, fa0, fa5
-; CHECK-NEXT: fsw fa5, 0(a1)
-; CHECK-NEXT: addi a1, a1, 4
-; CHECK-NEXT: bne a1, a0, .LBB31_6
+; CHECK-NEXT: fsw fa5, 0(a0)
+; CHECK-NEXT: addi a1, a1, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a1, .LBB31_6
; CHECK-NEXT: .LBB31_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -1999,8 +1999,7 @@ for.body: ; preds = %for.body.preheader,
define void @sink_splat_fma(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x) {
; CHECK-LABEL: sink_splat_fma:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a1, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB32_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2008,9 +2007,10 @@ define void @sink_splat_fma(ptr noalias nocapture %a, ptr nocapture readonly %b,
; CHECK-NEXT: vle32.v v9, (a1)
; CHECK-NEXT: vfmacc.vf v9, fa0, v8
; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a1, a2, .LBB32_1
+; CHECK-NEXT: bnez a2, .LBB32_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2037,8 +2037,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_fma_commute(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x) {
; CHECK-LABEL: sink_splat_fma_commute:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a1, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB33_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2046,9 +2045,10 @@ define void @sink_splat_fma_commute(ptr noalias nocapture %a, ptr nocapture read
; CHECK-NEXT: vle32.v v9, (a1)
; CHECK-NEXT: vfmacc.vf v9, fa0, v8
; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a1, a2, .LBB33_1
+; CHECK-NEXT: bnez a2, .LBB33_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2103,20 +2103,20 @@ define void @sink_splat_fma_scalable(ptr noalias nocapture %a, ptr noalias nocap
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a5, .LBB34_7
; CHECK-NEXT: .LBB34_5: # %for.body.preheader
+; CHECK-NEXT: addi a2, a4, -1024
; CHECK-NEXT: slli a4, a4, 2
-; CHECK-NEXT: add a2, a1, a4
+; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: .LBB34_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flw fa5, 0(a0)
-; CHECK-NEXT: flw fa4, 0(a2)
+; CHECK-NEXT: flw fa4, 0(a1)
; CHECK-NEXT: fmadd.s fa5, fa5, fa0, fa4
; CHECK-NEXT: fsw fa5, 0(a0)
-; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a2, a2, 1
+; CHECK-NEXT: addi a1, a1, 4
; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: bne a2, a1, .LBB34_6
+; CHECK-NEXT: bnez a2, .LBB34_6
; CHECK-NEXT: .LBB34_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2203,20 +2203,20 @@ define void @sink_splat_fma_commute_scalable(ptr noalias nocapture %a, ptr noali
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a5, .LBB35_7
; CHECK-NEXT: .LBB35_5: # %for.body.preheader
+; CHECK-NEXT: addi a2, a4, -1024
; CHECK-NEXT: slli a4, a4, 2
-; CHECK-NEXT: add a2, a1, a4
+; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: add a0, a0, a4
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: .LBB35_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: flw fa5, 0(a0)
-; CHECK-NEXT: flw fa4, 0(a2)
+; CHECK-NEXT: flw fa4, 0(a1)
; CHECK-NEXT: fmadd.s fa5, fa0, fa5, fa4
; CHECK-NEXT: fsw fa5, 0(a0)
-; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a2, a2, 1
+; CHECK-NEXT: addi a1, a1, 4
; CHECK-NEXT: addi a0, a0, 4
-; CHECK-NEXT: bne a2, a1, .LBB35_6
+; CHECK-NEXT: bnez a2, .LBB35_6
; CHECK-NEXT: .LBB35_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2280,8 +2280,7 @@ declare float @llvm.fma.f32(float, float, float)
define void @sink_splat_icmp(ptr nocapture %x, i32 signext %y) {
; CHECK-LABEL: sink_splat_icmp:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: .LBB36_1: # %vector.body
@@ -2289,8 +2288,9 @@ define void @sink_splat_icmp(ptr nocapture %x, i32 signext %y) {
; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: vmseq.vx v0, v9, a1
; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB36_1
+; CHECK-NEXT: bnez a2, .LBB36_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2316,8 +2316,7 @@ declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
define void @sink_splat_fcmp(ptr nocapture %x, float %y) {
; CHECK-LABEL: sink_splat_fcmp:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: .LBB37_1: # %vector.body
@@ -2325,8 +2324,9 @@ define void @sink_splat_fcmp(ptr nocapture %x, float %y) {
; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: vmfeq.vf v0, v9, fa0
; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB37_1
+; CHECK-NEXT: bnez a1, .LBB37_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2352,16 +2352,16 @@ declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
define void @sink_splat_udiv(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_udiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB38_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vdivu.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB38_1
+; CHECK-NEXT: bnez a2, .LBB38_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2386,16 +2386,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_sdiv(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_sdiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB39_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vdiv.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB39_1
+; CHECK-NEXT: bnez a2, .LBB39_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2420,16 +2420,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_urem(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_urem:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB40_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vremu.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB40_1
+; CHECK-NEXT: bnez a2, .LBB40_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2454,16 +2454,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_srem(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_srem:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB41_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vrem.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB41_1
+; CHECK-NEXT: bnez a2, .LBB41_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2489,42 +2489,42 @@ define void @sink_splat_udiv_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_udiv_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB42_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB42_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB42_5
; CHECK-NEXT: .LBB42_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB42_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vdivu.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB42_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB42_7
; CHECK-NEXT: .LBB42_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB42_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: divuw a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB42_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: divuw a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB42_6
; CHECK-NEXT: .LBB42_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2580,42 +2580,42 @@ define void @sink_splat_sdiv_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_sdiv_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB43_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB43_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB43_5
; CHECK-NEXT: .LBB43_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB43_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vdiv.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB43_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB43_7
; CHECK-NEXT: .LBB43_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB43_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: divw a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB43_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: divw a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB43_6
; CHECK-NEXT: .LBB43_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2671,42 +2671,42 @@ define void @sink_splat_urem_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_urem_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB44_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB44_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB44_5
; CHECK-NEXT: .LBB44_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB44_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vremu.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB44_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB44_7
; CHECK-NEXT: .LBB44_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB44_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: remuw a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB44_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: remuw a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB44_6
; CHECK-NEXT: .LBB44_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2762,42 +2762,42 @@ define void @sink_splat_srem_scalable(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_srem_scalable:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrr a5, vlenb
-; CHECK-NEXT: srli a2, a5, 1
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: bgeu a3, a2, .LBB45_2
+; CHECK-NEXT: srli a3, a5, 1
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: bgeu a2, a3, .LBB45_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a3, 0
+; CHECK-NEXT: li a2, 0
; CHECK-NEXT: j .LBB45_5
; CHECK-NEXT: .LBB45_2: # %vector.ph
-; CHECK-NEXT: addi a3, a2, -1
-; CHECK-NEXT: andi a4, a3, 1024
-; CHECK-NEXT: xori a3, a4, 1024
+; CHECK-NEXT: addi a2, a3, -1
+; CHECK-NEXT: andi a4, a2, 1024
+; CHECK-NEXT: xori a2, a4, 1024
; CHECK-NEXT: slli a5, a5, 1
; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma
; CHECK-NEXT: mv a6, a0
-; CHECK-NEXT: mv a7, a3
+; CHECK-NEXT: mv a7, a2
; CHECK-NEXT: .LBB45_3: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vl2re32.v v8, (a6)
; CHECK-NEXT: vrem.vx v8, v8, a1
; CHECK-NEXT: vs2r.v v8, (a6)
-; CHECK-NEXT: sub a7, a7, a2
+; CHECK-NEXT: sub a7, a7, a3
; CHECK-NEXT: add a6, a6, a5
; CHECK-NEXT: bnez a7, .LBB45_3
; CHECK-NEXT: # %bb.4: # %middle.block
; CHECK-NEXT: beqz a4, .LBB45_7
; CHECK-NEXT: .LBB45_5: # %for.body.preheader
-; CHECK-NEXT: slli a2, a3, 2
-; CHECK-NEXT: add a2, a0, a2
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a0, a0, a3
+; CHECK-NEXT: addi a3, a2, -1024
+; CHECK-NEXT: slli a2, a2, 2
+; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: .LBB45_6: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lw a3, 0(a2)
-; CHECK-NEXT: remw a3, a3, a1
-; CHECK-NEXT: sw a3, 0(a2)
-; CHECK-NEXT: addi a2, a2, 4
-; CHECK-NEXT: bne a2, a0, .LBB45_6
+; CHECK-NEXT: lw a2, 0(a0)
+; CHECK-NEXT: remw a2, a2, a1
+; CHECK-NEXT: sw a2, 0(a0)
+; CHECK-NEXT: addi a3, a3, 1
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: bnez a3, .LBB45_6
; CHECK-NEXT: .LBB45_7: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2854,8 +2854,7 @@ declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_mul(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_mul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB46_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2864,8 +2863,9 @@ define void @sink_splat_vp_mul(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i3
; CHECK-NEXT: vmul.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB46_1
+; CHECK-NEXT: bnez a3, .LBB46_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2892,8 +2892,7 @@ declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_add:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB47_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2902,8 +2901,9 @@ define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i3
; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB47_1
+; CHECK-NEXT: bnez a3, .LBB47_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2928,8 +2928,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_vp_add_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_add_commute:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB48_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2938,8 +2937,9 @@ define void @sink_splat_vp_add_commute(ptr nocapture %a, i32 signext %x, <4 x i1
; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB48_1
+; CHECK-NEXT: bnez a3, .LBB48_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2966,8 +2966,7 @@ declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_sub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_sub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB49_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -2976,8 +2975,9 @@ define void @sink_splat_vp_sub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i3
; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB49_1
+; CHECK-NEXT: bnez a3, .LBB49_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3002,8 +3002,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_vp_rsub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_rsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB50_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3012,8 +3011,9 @@ define void @sink_splat_vp_rsub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i
; CHECK-NEXT: vrsub.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB50_1
+; CHECK-NEXT: bnez a3, .LBB50_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3040,8 +3040,7 @@ declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_shl(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_shl:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB51_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3050,8 +3049,9 @@ define void @sink_splat_vp_shl(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i3
; CHECK-NEXT: vsll.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB51_1
+; CHECK-NEXT: bnez a3, .LBB51_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3078,8 +3078,7 @@ declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_lshr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_lshr:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB52_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3088,8 +3087,9 @@ define void @sink_splat_vp_lshr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i
; CHECK-NEXT: vsrl.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB52_1
+; CHECK-NEXT: bnez a3, .LBB52_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3116,8 +3116,7 @@ declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_ashr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_ashr:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB53_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3126,8 +3125,9 @@ define void @sink_splat_vp_ashr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i
; CHECK-NEXT: vsra.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB53_1
+; CHECK-NEXT: bnez a3, .LBB53_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3154,8 +3154,7 @@ declare <4 x float> @llvm.vp.fmul.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32)
define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_fmul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB54_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3164,8 +3163,9 @@ define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer
; CHECK-NEXT: vfmul.vf v8, v8, fa0, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB54_1
+; CHECK-NEXT: bnez a2, .LBB54_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3192,8 +3192,7 @@ declare <4 x float> @llvm.vp.fdiv.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32)
define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_fdiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB55_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3202,8 +3201,9 @@ define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer
; CHECK-NEXT: vfdiv.vf v8, v8, fa0, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB55_1
+; CHECK-NEXT: bnez a2, .LBB55_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3228,8 +3228,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_vp_frdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_frdiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB56_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3238,8 +3237,9 @@ define void @sink_splat_vp_frdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 ze
; CHECK-NEXT: vfrdiv.vf v8, v8, fa0, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB56_1
+; CHECK-NEXT: bnez a2, .LBB56_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3266,8 +3266,7 @@ declare <4 x float> @llvm.vp.fadd.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32)
define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_fadd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB57_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3276,8 +3275,9 @@ define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer
; CHECK-NEXT: vfadd.vf v8, v8, fa0, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB57_1
+; CHECK-NEXT: bnez a2, .LBB57_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3304,8 +3304,7 @@ declare <4 x float> @llvm.vp.fsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32)
define void @sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_fsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB58_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3314,8 +3313,9 @@ define void @sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer
; CHECK-NEXT: vfsub.vf v8, v8, fa0, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB58_1
+; CHECK-NEXT: bnez a2, .LBB58_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3342,8 +3342,7 @@ declare <4 x float> @llvm.vp.frsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32
define void @sink_splat_vp_frsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_frsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB59_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3352,8 +3351,9 @@ define void @sink_splat_vp_frsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 ze
; CHECK-NEXT: vfrsub.vf v8, v8, fa0, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB59_1
+; CHECK-NEXT: bnez a2, .LBB59_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3380,8 +3380,7 @@ declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_udiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB60_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3390,8 +3389,9 @@ define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i
; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB60_1
+; CHECK-NEXT: bnez a3, .LBB60_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3418,8 +3418,7 @@ declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_sdiv:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB61_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3428,8 +3427,9 @@ define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i
; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB61_1
+; CHECK-NEXT: bnez a3, .LBB61_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3456,8 +3456,7 @@ declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_urem:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB62_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3466,8 +3465,9 @@ define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i
; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB62_1
+; CHECK-NEXT: bnez a3, .LBB62_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3494,8 +3494,7 @@ declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_srem:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB63_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3504,8 +3503,9 @@ define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i
; CHECK-NEXT: vrem.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB63_1
+; CHECK-NEXT: bnez a3, .LBB63_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3534,8 +3534,7 @@ define void @sink_splat_vp_srem_commute(ptr nocapture %a, i32 signext %x, <4 x i
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v8, a1
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: add a1, a0, a1
+; CHECK-NEXT: li a1, 1024
; CHECK-NEXT: .LBB64_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v9, (a0)
@@ -3543,8 +3542,9 @@ define void @sink_splat_vp_srem_commute(ptr nocapture %a, i32 signext %x, <4 x i
; CHECK-NEXT: vrem.vv v9, v8, v9, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: addi a1, a1, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a1, .LBB64_1
+; CHECK-NEXT: bnez a1, .LBB64_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3571,8 +3571,7 @@ declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4
define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_fma:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a1, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB65_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3582,9 +3581,10 @@ define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly
; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a1, a3, .LBB65_1
+; CHECK-NEXT: bnez a3, .LBB65_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3611,8 +3611,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: sink_splat_vp_fma_commute:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a1, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB66_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -3622,9 +3621,10 @@ define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture r
; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a1, a3, .LBB66_1
+; CHECK-NEXT: bnez a3, .LBB66_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3652,16 +3652,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_mul_lmul2(ptr nocapture %a, i64 signext %x) {
; CHECK-LABEL: sink_splat_mul_lmul2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: .LBB67_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vmul.vx v8, v8, a1
; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB67_1
+; CHECK-NEXT: bnez a2, .LBB67_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3686,16 +3686,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_add_lmul2(ptr nocapture %a, i64 signext %x) {
; CHECK-LABEL: sink_splat_add_lmul2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: .LBB68_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a1
; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB68_1
+; CHECK-NEXT: bnez a2, .LBB68_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3720,16 +3720,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_sub_lmul2(ptr nocapture %a, i64 signext %x) {
; CHECK-LABEL: sink_splat_sub_lmul2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: .LBB69_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vsub.vx v8, v8, a1
; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB69_1
+; CHECK-NEXT: bnez a2, .LBB69_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3754,16 +3754,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_rsub_lmul2(ptr nocapture %a, i64 signext %x) {
; CHECK-LABEL: sink_splat_rsub_lmul2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: .LBB70_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vrsub.vx v8, v8, a1
; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB70_1
+; CHECK-NEXT: bnez a2, .LBB70_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3788,16 +3788,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_and_lmul2(ptr nocapture %a, i64 signext %x) {
; CHECK-LABEL: sink_splat_and_lmul2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: .LBB71_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vand.vx v8, v8, a1
; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB71_1
+; CHECK-NEXT: bnez a2, .LBB71_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3822,16 +3822,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_or_lmul2(ptr nocapture %a, i64 signext %x) {
; CHECK-LABEL: sink_splat_or_lmul2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: .LBB72_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vor.vx v8, v8, a1
; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB72_1
+; CHECK-NEXT: bnez a2, .LBB72_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3856,16 +3856,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_xor_lmul2(ptr nocapture %a, i64 signext %x) {
; CHECK-LABEL: sink_splat_xor_lmul2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: .LBB73_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB73_1
+; CHECK-NEXT: bnez a2, .LBB73_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3890,8 +3890,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_mul_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_mul_lmul8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: .LBB74_1: # %vector.body
@@ -3899,8 +3898,9 @@ define void @sink_splat_mul_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmul.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB74_1
+; CHECK-NEXT: bnez a2, .LBB74_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3925,8 +3925,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_add_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_add_lmul8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: .LBB75_1: # %vector.body
@@ -3934,8 +3933,9 @@ define void @sink_splat_add_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB75_1
+; CHECK-NEXT: bnez a2, .LBB75_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3960,8 +3960,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_sub_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_sub_lmul8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: .LBB76_1: # %vector.body
@@ -3969,8 +3968,9 @@ define void @sink_splat_sub_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsub.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB76_1
+; CHECK-NEXT: bnez a2, .LBB76_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3995,8 +3995,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_rsub_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_rsub_lmul8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: .LBB77_1: # %vector.body
@@ -4004,8 +4003,9 @@ define void @sink_splat_rsub_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vrsub.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB77_1
+; CHECK-NEXT: bnez a2, .LBB77_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4030,8 +4030,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_and_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_and_lmul8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: .LBB78_1: # %vector.body
@@ -4039,8 +4038,9 @@ define void @sink_splat_and_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vand.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB78_1
+; CHECK-NEXT: bnez a2, .LBB78_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4065,8 +4065,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_or_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_or_lmul8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: .LBB79_1: # %vector.body
@@ -4074,8 +4073,9 @@ define void @sink_splat_or_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vor.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB79_1
+; CHECK-NEXT: bnez a2, .LBB79_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4100,8 +4100,7 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_xor_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_xor_lmul8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT: .LBB80_1: # %vector.body
@@ -4109,8 +4108,9 @@ define void @sink_splat_xor_lmul8(ptr nocapture %a, i32 signext %x) {
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB80_1
+; CHECK-NEXT: bnez a2, .LBB80_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4135,16 +4135,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_mul_lmulmf2(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_mul_lmulmf2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB81_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmul.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB81_1
+; CHECK-NEXT: bnez a2, .LBB81_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4169,16 +4169,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_add_lmulmf2(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_add_lmulmf2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB82_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vadd.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB82_1
+; CHECK-NEXT: bnez a2, .LBB82_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4203,16 +4203,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_sub_lmulmf2(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_sub_lmulmf2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB83_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsub.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB83_1
+; CHECK-NEXT: bnez a2, .LBB83_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4237,16 +4237,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_rsub_lmulmf2(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_rsub_lmulmf2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB84_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vrsub.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB84_1
+; CHECK-NEXT: bnez a2, .LBB84_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4271,16 +4271,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_and_lmulmf2(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_and_lmulmf2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB85_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vand.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB85_1
+; CHECK-NEXT: bnez a2, .LBB85_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4305,16 +4305,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_or_lmulmf2(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_or_lmulmf2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB86_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vor.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB86_1
+; CHECK-NEXT: bnez a2, .LBB86_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4339,16 +4339,16 @@ for.cond.cleanup: ; preds = %vector.body
define void @sink_splat_xor_lmulmf2(ptr nocapture %a, i32 signext %x) {
; CHECK-LABEL: sink_splat_xor_lmulmf2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a2, 2
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB87_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 32
-; CHECK-NEXT: bne a0, a2, .LBB87_1
+; CHECK-NEXT: bnez a2, .LBB87_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4376,8 +4376,7 @@ define void @sink_splat_vp_icmp(ptr nocapture %x, i32 signext %y, <4 x i1> %m, i
; CHECK-LABEL: sink_splat_vp_icmp:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmv1r.v v8, v0
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a3, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v9, 0
; CHECK-NEXT: .LBB88_1: # %vector.body
@@ -4388,8 +4387,9 @@ define void @sink_splat_vp_icmp(ptr nocapture %x, i32 signext %y, <4 x i1> %m, i
; CHECK-NEXT: vmseq.vx v0, v10, a1, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v9, (a0), v0.t
+; CHECK-NEXT: addi a3, a3, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB88_1
+; CHECK-NEXT: bnez a3, .LBB88_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -4417,8 +4417,7 @@ define void @sink_splat_vp_fcmp(ptr nocapture %x, float %y, <4 x i1> %m, i32 zer
; CHECK-LABEL: sink_splat_vp_fcmp:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmv1r.v v8, v0
-; CHECK-NEXT: lui a2, 1
-; CHECK-NEXT: add a2, a0, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v9, 0
; CHECK-NEXT: .LBB89_1: # %vector.body
@@ -4429,8 +4428,9 @@ define void @sink_splat_vp_fcmp(ptr nocapture %x, float %y, <4 x i1> %m, i32 zer
; CHECK-NEXT: vmfeq.vf v0, v10, fa0, v0.t
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v9, (a0), v0.t
+; CHECK-NEXT: addi a2, a2, -4
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a2, .LBB89_1
+; CHECK-NEXT: bnez a2, .LBB89_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll
new file mode 100644
index 000000000000..42577408f71b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll
@@ -0,0 +1,379 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
+
+declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)
+
+define void @trunc_sat_i8i16_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i16_maxmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vnclip.wi v8, v8, 0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i16>, ptr %x, align 16
+ %2 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %1, <4 x i16> <i16 -128, i16 -128, i16 -128, i16 -128>)
+ %3 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %2, <4 x i16> <i16 127, i16 127, i16 127, i16 127>)
+ %4 = trunc <4 x i16> %3 to <4 x i8>
+ store <4 x i8> %4, ptr %y, align 8
+ ret void
+}
+
+define void @trunc_sat_i8i16_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i16_minmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vnclip.wi v8, v8, 0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i16>, ptr %x, align 16
+ %2 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %1, <4 x i16> <i16 127, i16 127, i16 127, i16 127>)
+ %3 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %2, <4 x i16> <i16 -128, i16 -128, i16 -128, i16 -128>)
+ %4 = trunc <4 x i16> %3 to <4 x i8>
+ store <4 x i8> %4, ptr %y, align 8
+ ret void
+}
+
+define void @trunc_sat_i8i16_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i16_notopt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: li a0, -127
+; CHECK-NEXT: vmax.vx v8, v8, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmin.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i16>, ptr %x, align 16
+ %2 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %1, <4 x i16> <i16 -127, i16 -127, i16 -127, i16 -127>)
+ %3 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %2, <4 x i16> <i16 128, i16 128, i16 128, i16 128>)
+ %4 = trunc <4 x i16> %3 to <4 x i8>
+ store <4 x i8> %4, ptr %y, align 8
+ ret void
+}
+
+define void @trunc_sat_u8u16_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i16>, ptr %x, align 16
+ %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+ %3 = trunc <4 x i16> %2 to <4 x i8>
+ store <4 x i8> %3, ptr %y, align 8
+ ret void
+}
+
+define void @trunc_sat_u8u16_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_notopt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: li a0, 127
+; CHECK-NEXT: vminu.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i16>, ptr %x, align 16
+ %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 127, i16 127, i16 127, i16 127>)
+ %3 = trunc <4 x i16> %2 to <4 x i8>
+ store <4 x i8> %3, ptr %y, align 8
+ ret void
+}
+
+define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_maxmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i16>, ptr %x, align 16
+ %2 = tail call <4 x i16> @llvm.umax.v4i16(<4 x i16> %1, <4 x i16> <i16 0, i16 0, i16 0, i16 0>)
+ %3 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %2, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+ %4 = trunc <4 x i16> %3 to <4 x i8>
+ store <4 x i8> %4, ptr %y, align 8
+ ret void
+}
+
+define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_minmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i16>, ptr %x, align 16
+ %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+ %3 = tail call <4 x i16> @llvm.umax.v4i16(<4 x i16> %2, <4 x i16> <i16 0, i16 0, i16 0, i16 0>)
+ %4 = trunc <4 x i16> %3 to <4 x i8>
+ store <4 x i8> %4, ptr %y, align 8
+ ret void
+}
+
+
+define void @trunc_sat_i16i32_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i32_notopt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vmax.vx v8, v8, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmin.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i32>, ptr %x, align 32
+ %2 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> <i32 -32767, i32 -32767, i32 -32767, i32 -32767>)
+ %3 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %2, <4 x i32> <i32 32768, i32 32768, i32 32768, i32 32768>)
+ %4 = trunc <4 x i32> %3 to <4 x i16>
+ store <4 x i16> %4, ptr %y, align 16
+ ret void
+}
+
+define void @trunc_sat_i16i32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i32_maxmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vnclip.wi v8, v8, 0
+; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i32>, ptr %x, align 32
+ %2 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
+ %3 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %2, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+ %4 = trunc <4 x i32> %3 to <4 x i16>
+ store <4 x i16> %4, ptr %y, align 16
+ ret void
+}
+
+define void @trunc_sat_i16i32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i32_minmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vnclip.wi v8, v8, 0
+; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i32>, ptr %x, align 32
+ %2 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %1, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+ %3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
+ %4 = trunc <4 x i32> %3 to <4 x i16>
+ store <4 x i16> %4, ptr %y, align 16
+ ret void
+}
+
+define void @trunc_sat_u16u32_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_notopt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: vminu.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i32>, ptr %x, align 32
+ %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+ %3 = trunc <4 x i32> %2 to <4 x i16>
+ store <4 x i16> %3, ptr %y, align 16
+ ret void
+}
+
+define void @trunc_sat_u16u32_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i32>, ptr %x, align 32
+ %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+ %3 = trunc <4 x i32> %2 to <4 x i16>
+ store <4 x i16> %3, ptr %y, align 16
+ ret void
+}
+
+define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_minmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i32>, ptr %x, align 32
+ %2 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %3 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %2, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+ %4 = trunc <4 x i32> %3 to <4 x i16>
+ store <4 x i16> %4, ptr %y, align 16
+ ret void
+}
+
+define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_maxmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vse16.v v8, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i32>, ptr %x, align 32
+ %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+ %3 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %2, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+ %4 = trunc <4 x i32> %3 to <4 x i16>
+ store <4 x i16> %4, ptr %y, align 16
+ ret void
+}
+
+
+define void @trunc_sat_i32i64_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i32i64_notopt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addiw a0, a0, 1
+; CHECK-NEXT: vmax.vx v8, v8, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 31
+; CHECK-NEXT: vmin.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnsrl.wi v10, v8, 0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i64>, ptr %x, align 64
+ %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> <i64 -2147483647, i64 -2147483647, i64 -2147483647, i64 -2147483647>)
+ %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 2147483648, i64 2147483648, i64 2147483648, i64 2147483648>)
+ %4 = trunc <4 x i64> %3 to <4 x i32>
+ store <4 x i32> %4, ptr %y, align 32
+ ret void
+}
+
+define void @trunc_sat_i32i64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i32i64_maxmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vnclip.wi v10, v8, 0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i64>, ptr %x, align 64
+ %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>)
+ %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
+ %4 = trunc <4 x i64> %3 to <4 x i32>
+ store <4 x i32> %4, ptr %y, align 32
+ ret void
+}
+
+define void @trunc_sat_i32i64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i32i64_minmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vnclip.wi v10, v8, 0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i64>, ptr %x, align 64
+ %2 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %1, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
+ %3 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %2, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>)
+ %4 = trunc <4 x i64> %3 to <4 x i32>
+ store <4 x i32> %4, ptr %y, align 32
+ ret void
+}
+
+
+define void @trunc_sat_u32u64_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_notopt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addiw a0, a0, -1
+; CHECK-NEXT: vminu.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vnsrl.wi v10, v8, 0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i64>, ptr %x, align 64
+ %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
+ %3 = trunc <4 x i64> %2 to <4 x i32>
+ store <4 x i32> %3, ptr %y, align 32
+ ret void
+}
+
+define void @trunc_sat_u32u64_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_min:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v10, v8, 0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i64>, ptr %x, align 64
+ %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+ %3 = trunc <4 x i64> %2 to <4 x i32>
+ store <4 x i32> %3, ptr %y, align 32
+ ret void
+}
+
+
+define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_maxmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v10, v8, 0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i64>, ptr %x, align 64
+ %2 = tail call <4 x i64> @llvm.umax.v4i64(<4 x i64> %1, <4 x i64> <i64 0, i64 0, i64 0, i64 0>)
+ %3 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %2, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+ %4 = trunc <4 x i64> %3 to <4 x i32>
+ store <4 x i32> %4, ptr %y, align 32
+ ret void
+}
+
+define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_minmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vnclipu.wi v10, v8, 0
+; CHECK-NEXT: vse32.v v10, (a1)
+; CHECK-NEXT: ret
+ %1 = load <4 x i64>, ptr %x, align 64
+ %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+ %3 = tail call <4 x i64> @llvm.umax.v4i64(<4 x i64> %2, <4 x i64> <i64 0, i64 0, i64 0, i64 0>)
+ %4 = trunc <4 x i64> %3 to <4 x i32>
+ store <4 x i32> %4, ptr %y, align 32
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll b/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll
index 1ad30fa264e0..9d394a1ee3ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaesdf.ll
@@ -123,4 +123,3 @@ entry:
ret <vscale x 16 x i32> %a
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll b/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll
index b7a4a1166331..f21bdcac032f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaesdm.ll
@@ -123,4 +123,3 @@ entry:
ret <vscale x 16 x i32> %a
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesef.ll b/llvm/test/CodeGen/RISCV/rvv/vaesef.ll
index bd8a7cb94ab0..ee11786583d7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaesef.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaesef.ll
@@ -123,4 +123,3 @@ entry:
ret <vscale x 16 x i32> %a
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesem.ll b/llvm/test/CodeGen/RISCV/rvv/vaesem.ll
index 21b5e5942e31..65486e119842 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaesem.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaesem.ll
@@ -123,4 +123,3 @@ entry:
ret <vscale x 16 x i32> %a
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vaesz.ll b/llvm/test/CodeGen/RISCV/rvv/vaesz.ll
index ee089f1e77cc..2453119ce92d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vaesz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vaesz.ll
@@ -63,4 +63,3 @@ entry:
ret <vscale x 16 x i32> %a
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
index a1b1c015369c..f9c53c93472a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -2003,4 +2003,3 @@ define <vscale x 8 x i64> @vandn_vx_swapped_nxv8i64(i64 %x, <vscale x 8 x i64> %
%b = and <vscale x 8 x i64> %splat, %y
ret <vscale x 8 x i64> %b
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
index 4ff6e5660b25..f076c3c621cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
@@ -1429,4 +1429,3 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
%x = call <vscale x 8 x i64> @llvm.vp.and.nxv8i64(<vscale x 8 x i64> %b, <vscale x 8 x i64> %splat.not.a, <vscale x 8 x i1> %mask, i32 %evl)
ret <vscale x 8 x i64> %x
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index f68a15a0d014..c98242437f62 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -2370,4 +2370,3 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_max(<vscale x 8 x double> %a
}
attributes #0 = { vscale_range(2,0) }
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll
index 9f32efd5027d..c2ac155304dd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vexts-sdnode.ll
@@ -617,4 +617,3 @@ define <vscale x 8 x i64> @vzext_nxv8i32_nxv8i64(<vscale x 8 x i32> %va) {
%evec = zext <vscale x 8 x i32> %va to <vscale x 8 x i64>
ret <vscale x 8 x i64> %evec
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll
index 906b4b232d65..4c8fc06ee195 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfmin \
; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfmin \
; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
declare <vscale x 1 x bfloat> @llvm.riscv.vfncvtbf16.f.f.w.nxv1bf16.nxv1f32(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll
index c297cfd1f6ed..35b2df75babf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfmin \
; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfmin \
; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
declare <vscale x 1 x float> @llvm.riscv.vfwcvtbf16.f.f.v.nxv1f32.nxv1bf16(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll
index 1278f7790a06..d243c89958c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmarith-sdnode.ll
@@ -476,4 +476,3 @@ define <vscale x 16 x i1> @vmorn_vv_nxv16i1(<vscale x 16 x i1> %va, <vscale x 16
%vc = or <vscale x 16 x i1> %va, %not
ret <vscale x 16 x i1> %vc
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll
index b4041f0b6783..1247f3d29c09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-sdnode.ll
@@ -889,4 +889,3 @@ define <vscale x 8 x i64> @vmax_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
%vc = select <vscale x 8 x i1> %cmp, <vscale x 8 x i64> %va, <vscale x 8 x i64> %splat
ret <vscale x 8 x i64> %vc
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll
index 5398803b2489..7405282a07b7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-sdnode.ll
@@ -889,4 +889,3 @@ define <vscale x 8 x i64> @vmin_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
%vc = select <vscale x 8 x i1> %cmp, <vscale x 8 x i64> %va, <vscale x 8 x i64> %splat
ret <vscale x 8 x i64> %vc
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll
index f1f7cdd47626..3e14058210e5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll
@@ -1004,4 +1004,3 @@ define <vscale x 8 x i32> @vmul_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
%vc = mul <vscale x 8 x i32> %va, %vs
ret <vscale x 8 x i32> %vc
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll
index 58874fe8c8fc..1eafb3bdfed2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll
@@ -1299,4 +1299,3 @@ define <vscale x 8 x i64> @vrem_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
%vc = srem <vscale x 8 x i64> %va, %splat
ret <vscale x 8 x i64> %vc
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
index fc6af87e473e..4ea5a6709db5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
@@ -1154,4 +1154,3 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
%x = call <vscale x 8 x i64> @llvm.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> %b.splat)
ret <vscale x 8 x i64> %x
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
index 13a584a67328..b8a091b24259 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
@@ -1976,4 +1976,3 @@ define <vscale x 8 x i64> @vror_vi_rotl_nxv8i64(<vscale x 8 x i64> %a) {
%x = call <vscale x 8 x i64> @llvm.fshl.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %a, <vscale x 8 x i64> shufflevector(<vscale x 8 x i64> insertelement(<vscale x 8 x i64> poison, i64 1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer))
ret <vscale x 8 x i64> %x
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll
index 43705631e545..127d3ffd7931 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrsub-sdnode.ll
@@ -581,4 +581,3 @@ define <vscale x 8 x i64> @vrsub_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
%vc = sub <vscale x 8 x i64> %splat, %va
ret <vscale x 8 x i64> %vc
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
new file mode 100644
index 000000000000..fe605d5ca6f9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
+; Check that the default value enables the web folding, i.e. that it is
+; at least 3.
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING
+
+
+; Check that the scalable vector add/sub/mul operations are all promoted into their
+; vw counterparts when the maximum web size for the folding is increased to 3.
+; We need the web size to be at least 3 for the folding to happen, because
+; %c has 3 uses.
+; see https://github.com/llvm/llvm-project/pull/72340
+define <vscale x 2 x i16> @vwop_vscale_sext_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_sext_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; NO_FOLDING-NEXT: vle8.v v8, (a0)
+; NO_FOLDING-NEXT: vle8.v v9, (a1)
+; NO_FOLDING-NEXT: vle8.v v10, (a2)
+; NO_FOLDING-NEXT: vsext.vf2 v11, v8
+; NO_FOLDING-NEXT: vsext.vf2 v8, v9
+; NO_FOLDING-NEXT: vsext.vf2 v9, v10
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_sext_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vle8.v v8, (a0)
+; FOLDING-NEXT: vle8.v v9, (a1)
+; FOLDING-NEXT: vle8.v v10, (a2)
+; FOLDING-NEXT: vwmul.vv v11, v8, v9
+; FOLDING-NEXT: vwadd.vv v9, v8, v10
+; FOLDING-NEXT: vwsub.vv v12, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vor.vv v8, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v12
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i8>, ptr %x
+ %b = load <vscale x 2 x i8>, ptr %y
+ %b2 = load <vscale x 2 x i8>, ptr %z
+ %c = sext <vscale x 2 x i8> %a to <vscale x 2 x i16>
+ %d = sext <vscale x 2 x i8> %b to <vscale x 2 x i16>
+ %d2 = sext <vscale x 2 x i8> %b2 to <vscale x 2 x i16>
+ %e = mul <vscale x 2 x i16> %c, %d
+ %f = add <vscale x 2 x i16> %c, %d2
+ %g = sub <vscale x 2 x i16> %c, %d2
+ %h = or <vscale x 2 x i16> %e, %f
+ %i = or <vscale x 2 x i16> %h, %g
+ ret <vscale x 2 x i16> %i
+}
+
+
+
+define <vscale x 2 x i16> @vwop_vscale_zext_multiple_users(ptr %x, ptr %y, ptr %z) {
+; NO_FOLDING-LABEL: vwop_vscale_zext_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; NO_FOLDING-NEXT: vle8.v v8, (a0)
+; NO_FOLDING-NEXT: vle8.v v9, (a1)
+; NO_FOLDING-NEXT: vle8.v v10, (a2)
+; NO_FOLDING-NEXT: vzext.vf2 v11, v8
+; NO_FOLDING-NEXT: vzext.vf2 v8, v9
+; NO_FOLDING-NEXT: vzext.vf2 v9, v10
+; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
+; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
+; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vwop_vscale_zext_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vle8.v v8, (a0)
+; FOLDING-NEXT: vle8.v v9, (a1)
+; FOLDING-NEXT: vle8.v v10, (a2)
+; FOLDING-NEXT: vwmulu.vv v11, v8, v9
+; FOLDING-NEXT: vwaddu.vv v9, v8, v10
+; FOLDING-NEXT: vwsubu.vv v12, v8, v10
+; FOLDING-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; FOLDING-NEXT: vor.vv v8, v11, v9
+; FOLDING-NEXT: vor.vv v8, v8, v12
+; FOLDING-NEXT: ret
+ %a = load <vscale x 2 x i8>, ptr %x
+ %b = load <vscale x 2 x i8>, ptr %y
+ %b2 = load <vscale x 2 x i8>, ptr %z
+ %c = zext <vscale x 2 x i8> %a to <vscale x 2 x i16>
+ %d = zext <vscale x 2 x i8> %b to <vscale x 2 x i16>
+ %d2 = zext <vscale x 2 x i8> %b2 to <vscale x 2 x i16>
+ %e = mul <vscale x 2 x i16> %c, %d
+ %f = add <vscale x 2 x i16> %c, %d2
+ %g = sub <vscale x 2 x i16> %c, %d2
+ %h = or <vscale x 2 x i16> %e, %f
+ %i = or <vscale x 2 x i16> %h, %g
+ ret <vscale x 2 x i16> %i
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll
index 5804f8edf84d..39d73bed2592 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll
@@ -24,12 +24,614 @@ define zeroext i32 @vsetvl_zext() {
ret i32 %b
}
-define i64 @vsetvl_and17bits() {
-; CHECK-LABEL: vsetvl_and17bits:
+define i64 @vsetvl_e8m1_and14bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m1_and14bits:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli a0, 1, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli a0, a0, e8, m1, ta, ma
; CHECK-NEXT: ret
- %a = call i64 @llvm.riscv.vsetvli(i64 1, i64 1, i64 1)
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 0)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8m1_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m1_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, m1, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 0)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8m1_constant_avl() {
+; CHECK-LABEL: vsetvl_e8m1_constant_avl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a0, 1, e8, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 1, i64 0, i64 0)
+ %b = and i64 %a, 1
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8m2_and15bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m2_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 1)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8m2_and14bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m2_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, m2, ta, ma
+; CHECK-NEXT: slli a0, a0, 50
+; CHECK-NEXT: srli a0, a0, 50
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 1)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8m4_and16bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m4_and16bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 2)
+ %b = and i64 %a, 65535
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8m4_and15bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m4_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 49
+; CHECK-NEXT: srli a0, a0, 49
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 2)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8m8_and17bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m8_and17bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 3)
%b = and i64 %a, 131071
ret i64 %b
}
+
+define i64 @vsetvl_e8m8_and16bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8m8_and16bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a0, a0, 48
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 3)
+ %b = and i64 %a, 65535
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8mf2_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8mf2_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 5)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8mf2_and10bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8mf2_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 5)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8mf4_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8mf4_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 6)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8mf4_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8mf4_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 6)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8mf8_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8mf8_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 7)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e8mf8_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e8mf8_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e8, mf2, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 7)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m1_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m1_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 0)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m1_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m1_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m1, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 0)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m2_and14bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m2_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 1)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m2_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m2_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m2, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 1)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m4_and15bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m4_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 2)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m4_and14bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m4_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 50
+; CHECK-NEXT: srli a0, a0, 50
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 2)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m8_and16bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m8_and16bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 3)
+ %b = and i64 %a, 65535
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16m8_and15bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16m8_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 49
+; CHECK-NEXT: srli a0, a0, 49
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 3)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16mf2_and10bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16mf2_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 5)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16mf2_and9bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16mf2_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 511
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 5)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16mf4_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16mf4_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 6)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16mf4_and10bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16mf4_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 6)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16mf8_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16mf8_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 7)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e16mf8_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e16mf8_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e16, mf2, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 7)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m1_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m1_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 0)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m1_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m1_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m1, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 0)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m2_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m2_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 1)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m2_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m2_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m2, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 1)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m4_and14bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m4_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 2)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m4_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m4_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 2)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m8_and15bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m8_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 3)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32m8_and14bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32m8_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 50
+; CHECK-NEXT: srli a0, a0, 50
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 3)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32mf2_and9bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32mf2_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 5)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32mf2_and8bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32mf2_and8bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 5)
+ %b = and i64 %a, 255
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32mf4_and10bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32mf4_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 6)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32mf4_and9bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32mf4_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 511
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 6)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32mf8_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32mf8_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 7)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e32mf8_and10bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e32mf8_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e32, mf2, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 7)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m1_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m1_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 0)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m1_and10bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m1_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m1, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 0)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m2_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m2_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 1)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m2_and11bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m2_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m2, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 1)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m4_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m4_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 2)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m4_and12bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m4_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 2)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m8_and14bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m8_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 3)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64m8_and13bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64m8_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 3)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64mf2_and8bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64mf2_and8bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 5)
+ %b = and i64 %a, 255
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64mf2_and7bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64mf2_and7bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 127
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 5)
+ %b = and i64 %a, 127
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64mf4_and9bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64mf4_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 6)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64mf4_and8bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64mf4_and8bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 6)
+ %b = and i64 %a, 255
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64mf8_and10bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64mf8_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 7)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvl_e64mf8_and9bits(i64 %avl) {
+; CHECK-LABEL: vsetvl_e64mf8_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, a0, e64, mf2, ta, ma
+; CHECK-NEXT: andi a0, a0, 511
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 7)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index f154fd2cd14a..73f651225da6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -951,15 +951,16 @@ if.end:
define void @pre_over_vle(ptr %A) {
; CHECK-LABEL: pre_over_vle:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi a1, a0, 800
+; CHECK-NEXT: li a1, 100
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: .LBB22_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vsext.vf4 v9, v8
; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: addi a0, a0, 8
-; CHECK-NEXT: bne a0, a1, .LBB22_1
+; CHECK-NEXT: bnez a1, .LBB22_1
; CHECK-NEXT: # %bb.2: # %exit
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll
new file mode 100644
index 000000000000..b2a676dc0daf
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll
@@ -0,0 +1,626 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+declare i64 @llvm.riscv.vsetvlimax(i64, i64);
+
+define signext i32 @vsetvlmax_sext() {
+; CHECK-LABEL: vsetvlmax_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1)
+ %b = trunc i64 %a to i32
+ ret i32 %b
+}
+
+define zeroext i32 @vsetvlmax_zext() {
+; CHECK-LABEL: vsetvlmax_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1)
+ %b = trunc i64 %a to i32
+ ret i32 %b
+}
+
+define i64 @vsetvlmax_e8m1_and14bits() {
+; CHECK-LABEL: vsetvlmax_e8m1_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 0)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8m1_and13bits() {
+; CHECK-LABEL: vsetvlmax_e8m1_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 0)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8m2_and15bits() {
+; CHECK-LABEL: vsetvlmax_e8m2_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 1)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8m2_and14bits() {
+; CHECK-LABEL: vsetvlmax_e8m2_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: slli a0, a0, 50
+; CHECK-NEXT: srli a0, a0, 50
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 1)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8m4_and16bits() {
+; CHECK-LABEL: vsetvlmax_e8m4_and16bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 2)
+ %b = and i64 %a, 65535
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8m4_and15bits() {
+; CHECK-LABEL: vsetvlmax_e8m4_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 49
+; CHECK-NEXT: srli a0, a0, 49
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 2)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8m8_and17bits() {
+; CHECK-LABEL: vsetvlmax_e8m8_and17bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 3)
+ %b = and i64 %a, 131071
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8m8_and16bits() {
+; CHECK-LABEL: vsetvlmax_e8m8_and16bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a0, a0, 48
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 3)
+ %b = and i64 %a, 65535
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8mf2_and11bits() {
+; CHECK-LABEL: vsetvlmax_e8mf2_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 5)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8mf2_and10bits() {
+; CHECK-LABEL: vsetvlmax_e8mf2_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 5)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8mf4_and12bits() {
+; CHECK-LABEL: vsetvlmax_e8mf4_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 6)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8mf4_and11bits() {
+; CHECK-LABEL: vsetvlmax_e8mf4_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 6)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8mf8_and13bits() {
+; CHECK-LABEL: vsetvlmax_e8mf8_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 7)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e8mf8_and12bits() {
+; CHECK-LABEL: vsetvlmax_e8mf8_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 7)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m1_and13bits() {
+; CHECK-LABEL: vsetvlmax_e16m1_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 0)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m1_and12bits() {
+; CHECK-LABEL: vsetvlmax_e16m1_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 0)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m2_and14bits() {
+; CHECK-LABEL: vsetvlmax_e16m2_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m2_and13bits() {
+; CHECK-LABEL: vsetvlmax_e16m2_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m4_and15bits() {
+; CHECK-LABEL: vsetvlmax_e16m4_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 2)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m4_and14bits() {
+; CHECK-LABEL: vsetvlmax_e16m4_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 50
+; CHECK-NEXT: srli a0, a0, 50
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 2)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m8_and16bits() {
+; CHECK-LABEL: vsetvlmax_e16m8_and16bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 3)
+ %b = and i64 %a, 65535
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16m8_and15bits() {
+; CHECK-LABEL: vsetvlmax_e16m8_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 49
+; CHECK-NEXT: srli a0, a0, 49
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 3)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16mf2_and10bits() {
+; CHECK-LABEL: vsetvlmax_e16mf2_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 5)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16mf2_and9bits() {
+; CHECK-LABEL: vsetvlmax_e16mf2_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 511
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 5)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16mf4_and11bits() {
+; CHECK-LABEL: vsetvlmax_e16mf4_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 6)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16mf4_and10bits() {
+; CHECK-LABEL: vsetvlmax_e16mf4_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 6)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16mf8_and12bits() {
+; CHECK-LABEL: vsetvlmax_e16mf8_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 7)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e16mf8_and11bits() {
+; CHECK-LABEL: vsetvlmax_e16mf8_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 7)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m1_and12bits() {
+; CHECK-LABEL: vsetvlmax_e32m1_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 0)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m1_and11bits() {
+; CHECK-LABEL: vsetvlmax_e32m1_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 0)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m2_and13bits() {
+; CHECK-LABEL: vsetvlmax_e32m2_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 1)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m2_and12bits() {
+; CHECK-LABEL: vsetvlmax_e32m2_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 1)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m4_and14bits() {
+; CHECK-LABEL: vsetvlmax_e32m4_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 2)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m4_and13bits() {
+; CHECK-LABEL: vsetvlmax_e32m4_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 2)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m8_and15bits() {
+; CHECK-LABEL: vsetvlmax_e32m8_and15bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 3)
+ %b = and i64 %a, 32767
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32m8_and14bits() {
+; CHECK-LABEL: vsetvlmax_e32m8_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 50
+; CHECK-NEXT: srli a0, a0, 50
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 3)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32mf2_and9bits() {
+; CHECK-LABEL: vsetvlmax_e32mf2_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 5)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32mf2_and8bits() {
+; CHECK-LABEL: vsetvlmax_e32mf2_and8bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 5)
+ %b = and i64 %a, 255
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32mf4_and10bits() {
+; CHECK-LABEL: vsetvlmax_e32mf4_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 6)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32mf4_and9bits() {
+; CHECK-LABEL: vsetvlmax_e32mf4_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 511
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 6)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32mf8_and11bits() {
+; CHECK-LABEL: vsetvlmax_e32mf8_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 7)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e32mf8_and10bits() {
+; CHECK-LABEL: vsetvlmax_e32mf8_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 7)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m1_and11bits() {
+; CHECK-LABEL: vsetvlmax_e64m1_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 0)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m1_and10bits() {
+; CHECK-LABEL: vsetvlmax_e64m1_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: andi a0, a0, 1023
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 0)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m2_and12bits() {
+; CHECK-LABEL: vsetvlmax_e64m2_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 1)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m2_and11bits() {
+; CHECK-LABEL: vsetvlmax_e64m2_and11bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: andi a0, a0, 2047
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 1)
+ %b = and i64 %a, 2047
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m4_and13bits() {
+; CHECK-LABEL: vsetvlmax_e64m4_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 2)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m4_and12bits() {
+; CHECK-LABEL: vsetvlmax_e64m4_and12bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT: slli a0, a0, 52
+; CHECK-NEXT: srli a0, a0, 52
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 2)
+ %b = and i64 %a, 4095
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m8_and14bits() {
+; CHECK-LABEL: vsetvlmax_e64m8_and14bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 3)
+ %b = and i64 %a, 16383
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64m8_and13bits() {
+; CHECK-LABEL: vsetvlmax_e64m8_and13bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: slli a0, a0, 51
+; CHECK-NEXT: srli a0, a0, 51
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 3)
+ %b = and i64 %a, 8191
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64mf2_and8bits() {
+; CHECK-LABEL: vsetvlmax_e64mf2_and8bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, mf8, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 5)
+ %b = and i64 %a, 255
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64mf2_and7bits() {
+; CHECK-LABEL: vsetvlmax_e64mf2_and7bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, mf8, ta, ma
+; CHECK-NEXT: andi a0, a0, 127
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 5)
+ %b = and i64 %a, 127
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64mf4_and9bits() {
+; CHECK-LABEL: vsetvlmax_e64mf4_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, mf4, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 6)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64mf4_and8bits() {
+; CHECK-LABEL: vsetvlmax_e64mf4_and8bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, mf4, ta, ma
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 6)
+ %b = and i64 %a, 255
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64mf8_and10bits() {
+; CHECK-LABEL: vsetvlmax_e64mf8_and10bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, mf2, ta, ma
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 7)
+ %b = and i64 %a, 1023
+ ret i64 %b
+}
+
+define i64 @vsetvlmax_e64mf8_and9bits() {
+; CHECK-LABEL: vsetvlmax_e64mf8_and9bits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, mf2, ta, ma
+; CHECK-NEXT: andi a0, a0, 511
+; CHECK-NEXT: ret
+ %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 7)
+ %b = and i64 %a, 511
+ ret i64 %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll b/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll
index cb836596fdfd..bcea335deefa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsm4r.ll
@@ -123,4 +123,3 @@ entry:
ret <vscale x 16 x i32> %a
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll b/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll
index 7b8c2ae3e29b..f462f242f068 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsplats-i1.ll
@@ -209,4 +209,3 @@ define <vscale x 4 x i1> @splat_idx_nxv4i32(<vscale x 4 x i1> %v, i64 %idx) {
%splat = shufflevector <vscale x 4 x i1> %ins, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
ret <vscale x 4 x i1> %splat
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll
index 337761f69bf9..e86d6045df29 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtruncs-sdnode.ll
@@ -313,4 +313,3 @@ define <vscale x 8 x i32> @vtrunc_nxv8i64_nxv8i32(<vscale x 8 x i64> %va) {
%tvec = trunc <vscale x 8 x i64> %va to <vscale x 8 x i32>
ret <vscale x 8 x i32> %tvec
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll
index e304b19e71c8..e9e7fa077b52 100644
--- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-x.ll
@@ -2239,4 +2239,3 @@ entry:
}
declare <vscale x 16 x float> @llvm.riscv.sf.vc.v.i.se.nxv16f32.iXLen.iXLen(iXLen, iXLen, iXLen, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll
index 354ca3ad973f..23628a98feb7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xv.ll
@@ -3669,4 +3669,3 @@ entry:
}
declare <vscale x 16 x float> @llvm.riscv.sf.vc.v.fv.se.nxv16f32.nxv16f32.iXLen.f32(iXLen, <vscale x 16 x float>, float, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll
index 95fb51e3fa5b..2c9100111fab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvv.ll
@@ -3687,4 +3687,3 @@ entry:
}
declare <vscale x 16 x float> @llvm.riscv.sf.vc.v.fvv.se.nxv16f32.nxv16f32.nxv16i32.f32.iXLen(iXLen, <vscale x 16 x float>, <vscale x 16 x i32>, float %rs1, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll
index 4c0833b2ff0c..29b9238b8e9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/xsfvcp-xvw.ll
@@ -2694,4 +2694,3 @@ entry:
}
declare <vscale x 8 x double> @llvm.riscv.sf.vc.v.fvw.se.nxv8f64.nxv8f32.nxv8i32.f32.iXLen(iXLen, <vscale x 8 x double>, <vscale x 8 x i32>, float, iXLen)
-
diff --git a/llvm/test/CodeGen/RISCV/saverestore-scs.ll b/llvm/test/CodeGen/RISCV/saverestore-scs.ll
index 6c3ee2c2f383..6072943b3847 100644
--- a/llvm/test/CodeGen/RISCV/saverestore-scs.ll
+++ b/llvm/test/CodeGen/RISCV/saverestore-scs.ll
@@ -37,4 +37,3 @@ define void @callee_scs() nounwind shadowcallstack {
store volatile [30 x i32] %val, ptr @var2
ret void
}
-
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 606dbf9a6c57..cdfb1ef0ab4d 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -393,4 +393,3 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
%a = urem iXLen2 %x, 12
ret iXLen2 %a
}
-
diff --git a/llvm/test/CodeGen/RISCV/stack-folding.ll b/llvm/test/CodeGen/RISCV/stack-folding.ll
index 01f866f61216..8373a745e45c 100644
--- a/llvm/test/CodeGen/RISCV/stack-folding.ll
+++ b/llvm/test/CodeGen/RISCV/stack-folding.ll
@@ -7,10 +7,8 @@
define i1 @test_sext_w(i64 %x, i32 %y) nounwind {
; CHECK-LABEL: test_sext_w:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -144
-; CHECK-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; CHECK-NEXT: addi sp, sp, -128
+; CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -36,9 +34,7 @@ define i1 @test_sext_w(i64 %x, i32 %y) nounwind {
; CHECK-NEXT: lw a0, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: slti a0, a0, 0
; CHECK-NEXT: .LBB0_3: # %falsebb
-; CHECK-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -51,7 +47,7 @@ define i1 @test_sext_w(i64 %x, i32 %y) nounwind {
; CHECK-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 144
+; CHECK-NEXT: addi sp, sp, 128
; CHECK-NEXT: ret
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
%a = icmp eq i64 %x, 0
@@ -67,10 +63,8 @@ falsebb:
define i64 @test_sext_b(i64 %x, i8 %y) nounwind {
; RV64I-LABEL: test_sext_b:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -128
+; RV64I-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -97,9 +91,7 @@ define i64 @test_sext_b(i64 %x, i8 %y) nounwind {
; RV64I-NEXT: slli a0, a0, 56
; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: .LBB1_3: # %falsebb
-; RV64I-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -112,15 +104,13 @@ define i64 @test_sext_b(i64 %x, i8 %y) nounwind {
; RV64I-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: addi sp, sp, 128
; RV64I-NEXT: ret
;
; RV64ZB-LABEL: test_sext_b:
; RV64ZB: # %bb.0:
-; RV64ZB-NEXT: addi sp, sp, -144
-; RV64ZB-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64ZB-NEXT: addi sp, sp, -128
+; RV64ZB-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -145,9 +135,7 @@ define i64 @test_sext_b(i64 %x, i8 %y) nounwind {
; RV64ZB-NEXT: .LBB1_2: # %truebb
; RV64ZB-NEXT: lb a0, 8(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: .LBB1_3: # %falsebb
-; RV64ZB-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64ZB-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -160,7 +148,7 @@ define i64 @test_sext_b(i64 %x, i8 %y) nounwind {
; RV64ZB-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: addi sp, sp, 144
+; RV64ZB-NEXT: addi sp, sp, 128
; RV64ZB-NEXT: ret
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
%a = icmp eq i64 %x, 0
@@ -176,10 +164,8 @@ falsebb:
define i64 @test_sext_h(i64 %x, i16 %y) nounwind {
; RV64I-LABEL: test_sext_h:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -128
+; RV64I-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -206,9 +192,7 @@ define i64 @test_sext_h(i64 %x, i16 %y) nounwind {
; RV64I-NEXT: slli a0, a0, 48
; RV64I-NEXT: srai a0, a0, 48
; RV64I-NEXT: .LBB2_3: # %falsebb
-; RV64I-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -221,15 +205,13 @@ define i64 @test_sext_h(i64 %x, i16 %y) nounwind {
; RV64I-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: addi sp, sp, 128
; RV64I-NEXT: ret
;
; RV64ZB-LABEL: test_sext_h:
; RV64ZB: # %bb.0:
-; RV64ZB-NEXT: addi sp, sp, -144
-; RV64ZB-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64ZB-NEXT: addi sp, sp, -128
+; RV64ZB-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -254,9 +236,7 @@ define i64 @test_sext_h(i64 %x, i16 %y) nounwind {
; RV64ZB-NEXT: .LBB2_2: # %truebb
; RV64ZB-NEXT: lh a0, 8(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: .LBB2_3: # %falsebb
-; RV64ZB-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64ZB-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -269,7 +249,7 @@ define i64 @test_sext_h(i64 %x, i16 %y) nounwind {
; RV64ZB-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: addi sp, sp, 144
+; RV64ZB-NEXT: addi sp, sp, 128
; RV64ZB-NEXT: ret
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
%a = icmp eq i64 %x, 0
@@ -285,10 +265,8 @@ falsebb:
define i64 @test_zext_b(i64 %x, i8 %y) nounwind {
; CHECK-LABEL: test_zext_b:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -144
-; CHECK-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; CHECK-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; CHECK-NEXT: addi sp, sp, -128
+; CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -313,9 +291,7 @@ define i64 @test_zext_b(i64 %x, i8 %y) nounwind {
; CHECK-NEXT: .LBB3_2: # %truebb
; CHECK-NEXT: lbu a0, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT: .LBB3_3: # %falsebb
-; CHECK-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; CHECK-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -328,7 +304,7 @@ define i64 @test_zext_b(i64 %x, i8 %y) nounwind {
; CHECK-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; CHECK-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 144
+; CHECK-NEXT: addi sp, sp, 128
; CHECK-NEXT: ret
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
%a = icmp eq i64 %x, 0
@@ -344,10 +320,8 @@ falsebb:
define i64 @test_zext_h(i64 %x, i16 %y) nounwind {
; RV64I-LABEL: test_zext_h:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -128
+; RV64I-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -374,9 +348,7 @@ define i64 @test_zext_h(i64 %x, i16 %y) nounwind {
; RV64I-NEXT: slli a0, a0, 48
; RV64I-NEXT: srli a0, a0, 48
; RV64I-NEXT: .LBB4_3: # %falsebb
-; RV64I-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -389,15 +361,13 @@ define i64 @test_zext_h(i64 %x, i16 %y) nounwind {
; RV64I-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: addi sp, sp, 128
; RV64I-NEXT: ret
;
; RV64ZB-LABEL: test_zext_h:
; RV64ZB: # %bb.0:
-; RV64ZB-NEXT: addi sp, sp, -144
-; RV64ZB-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64ZB-NEXT: addi sp, sp, -128
+; RV64ZB-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -422,9 +392,7 @@ define i64 @test_zext_h(i64 %x, i16 %y) nounwind {
; RV64ZB-NEXT: .LBB4_2: # %truebb
; RV64ZB-NEXT: lhu a0, 8(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: .LBB4_3: # %falsebb
-; RV64ZB-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64ZB-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -437,7 +405,7 @@ define i64 @test_zext_h(i64 %x, i16 %y) nounwind {
; RV64ZB-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: addi sp, sp, 144
+; RV64ZB-NEXT: addi sp, sp, 128
; RV64ZB-NEXT: ret
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
%a = icmp eq i64 %x, 0
@@ -453,10 +421,8 @@ falsebb:
define i64 @test_zext_w(i64 %x, i32 %y) nounwind {
; RV64I-LABEL: test_zext_w:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -144
-; RV64I-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -128
+; RV64I-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -483,9 +449,7 @@ define i64 @test_zext_w(i64 %x, i32 %y) nounwind {
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: .LBB5_3: # %falsebb
-; RV64I-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -498,15 +462,13 @@ define i64 @test_zext_w(i64 %x, i32 %y) nounwind {
; RV64I-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 144
+; RV64I-NEXT: addi sp, sp, 128
; RV64I-NEXT: ret
;
; RV64ZB-LABEL: test_zext_w:
; RV64ZB: # %bb.0:
-; RV64ZB-NEXT: addi sp, sp, -144
-; RV64ZB-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd gp, 128(sp) # 8-byte Folded Spill
-; RV64ZB-NEXT: sd tp, 120(sp) # 8-byte Folded Spill
+; RV64ZB-NEXT: addi sp, sp, -128
+; RV64ZB-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
; RV64ZB-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
@@ -531,9 +493,7 @@ define i64 @test_zext_w(i64 %x, i32 %y) nounwind {
; RV64ZB-NEXT: .LBB5_2: # %truebb
; RV64ZB-NEXT: lwu a0, 8(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: .LBB5_3: # %falsebb
-; RV64ZB-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld gp, 128(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: ld tp, 120(sp) # 8-byte Folded Reload
+; RV64ZB-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
@@ -546,7 +506,7 @@ define i64 @test_zext_w(i64 %x, i32 %y) nounwind {
; RV64ZB-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
; RV64ZB-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
-; RV64ZB-NEXT: addi sp, sp, 144
+; RV64ZB-NEXT: addi sp, sp, 128
; RV64ZB-NEXT: ret
tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"()
%a = icmp eq i64 %x, 0
diff --git a/llvm/test/CodeGen/RISCV/switch-width.ll b/llvm/test/CodeGen/RISCV/switch-width.ll
index 8eed5130fa06..d902bd3276a3 100644
--- a/llvm/test/CodeGen/RISCV/switch-width.ll
+++ b/llvm/test/CodeGen/RISCV/switch-width.ll
@@ -314,4 +314,3 @@ return:
%retval = phi i32 [ -1, %sw.default ], [ 0, %sw.bb0 ], [ 1, %sw.bb1 ]
ret i32 %retval
}
-
diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
index 48a256929808..2fd4572d2345 100644
--- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
+++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
@@ -89,4 +89,3 @@ define signext i32 @unroll_loop_cse() {
%26 = phi i32 [ 1, %0 ], [ 1, %4 ], [ 1, %8 ], [ 1, %12 ], [ 1, %16 ], [ %24, %20 ]
ret i32 %26
}
-
diff --git a/llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll
new file mode 100644
index 000000000000..f7c8c251e197
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v -verify-machineinstrs -riscv-v-vector-bits-min=128 \
+; RUN: < %s | FileCheck %s
+
+declare <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64>, <2 x i64>, i32, <2 x i1>, i32, i32)
+declare <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32>, <4 x i32>, i32, <4 x i1>, i32, i32)
+declare <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16>, <8 x i16>, i32, <8 x i1>, i32, i32)
+declare <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8>, <16 x i8>, i32, <16 x i1>, i32, i32)
+
+declare <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double>, <2 x double>, i32, <2 x i1>, i32, i32)
+declare <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float>, <4 x float>, i32, <4 x i1>, i32, i32)
+
+define <2 x i64> @test_vp_splice_v2i64(<2 x i64> %va, <2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer
+
+ %v = call <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64> %va, <2 x i64> %vb, i32 5, <2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @test_vp_splice_v2i64_negative_offset(<2 x i64> %va, <2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2i64_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer
+
+ %v = call <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64> %va, <2 x i64> %vb, i32 -5, <2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @test_vp_splice_v2i64_masked(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2i64_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64> %va, <2 x i64> %vb, i32 5, <2 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <2 x i64> %v
+}
+
+define <4 x i32> @test_vp_splice_v4i32(<4 x i32> %va, <4 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer
+
+ %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 5, <4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @test_vp_splice_v4i32_negative_offset(<4 x i32> %va, <4 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4i32_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer
+
+ %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 -5, <4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @test_vp_splice_v4i32_masked(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4i32_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <4 x i32> %v
+}
+
+define <8 x i16> @test_vp_splice_v8i16(<8 x i16> %va, <8 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer
+
+ %v = call <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16> %va, <8 x i16> %vb, i32 5, <8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @test_vp_splice_v8i16_negative_offset(<8 x i16> %va, <8 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8i16_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer
+
+ %v = call <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16> %va, <8 x i16> %vb, i32 -5, <8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @test_vp_splice_v8i16_masked(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8i16_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16> %va, <8 x i16> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <8 x i16> %v
+}
+
+define <16 x i8> @test_vp_splice_v16i8(<16 x i8> %va, <16 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer
+
+ %v = call <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8> %va, <16 x i8> %vb, i32 5, <16 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @test_vp_splice_v16i8_negative_offset(<16 x i8> %va, <16 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v16i8_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer
+
+ %v = call <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8> %va, <16 x i8> %vb, i32 -5, <16 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @test_vp_splice_v16i8_masked(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v16i8_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8> %va, <16 x i8> %vb, i32 5, <16 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <16 x i8> %v
+}
+
+define <2 x double> @test_vp_splice_v2f64(<2 x double> %va, <2 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer
+
+ %v = call <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double> %va, <2 x double> %vb, i32 5, <2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <2 x double> %v
+}
+
+define <2 x double> @test_vp_splice_v2f64_negative_offset(<2 x double> %va, <2 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2f64_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer
+
+ %v = call <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double> %va, <2 x double> %vb, i32 -5, <2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <2 x double> %v
+}
+
+define <2 x double> @test_vp_splice_v2f64_masked(<2 x double> %va, <2 x double> %vb, <2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2f64_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double> %va, <2 x double> %vb, i32 5, <2 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <2 x double> %v
+}
+
+define <4 x float> @test_vp_splice_v4f32(<4 x float> %va, <4 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer
+
+ %v = call <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float> %va, <4 x float> %vb, i32 5, <4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <4 x float> %v
+}
+
+define <4 x float> @test_vp_splice_v4f32_negative_offset(<4 x float> %va, <4 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4f32_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer
+
+ %v = call <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float> %va, <4 x float> %vb, i32 -5, <4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <4 x float> %v
+}
+
+define <4 x float> @test_vp_splice_v4f32_masked(<4 x float> %va, <4 x float> %vb, <4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4f32_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float> %va, <4 x float> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <4 x float> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll
new file mode 100644
index 000000000000..9579973aee0d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll
@@ -0,0 +1,316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -riscv-v-vector-bits-min=128 \
+; RUN: < %s | FileCheck %s
+
+declare <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1>, <2 x i1>, i32, <2 x i1>, i32, i32)
+declare <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1>, <4 x i1>, i32, <4 x i1>, i32, i32)
+declare <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1>, <8 x i1>, i32, <8 x i1>, i32, i32)
+declare <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1>, <16 x i1>, i32, <16 x i1>, i32, i32)
+
+define <2 x i1> @test_vp_splice_v2i1(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer
+
+ %v = call <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1> %va, <2 x i1> %vb, i32 5, <2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <2 x i1> %v
+}
+
+define <2 x i1> @test_vp_splice_v2i1_negative_offset(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, mf8, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer
+
+ %v = call <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1> %va, <2 x i1> %vb, i32 -5, <2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <2 x i1> %v
+}
+
+define <2 x i1> @test_vp_splice_v2i1_masked(<2 x i1> %va, <2 x i1> %vb, <2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v2i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1> %va, <2 x i1> %vb, i32 5, <2 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <2 x i1> %v
+}
+
+define <4 x i1> @test_vp_splice_v4i1(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer
+
+ %v = call <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1> %va, <4 x i1> %vb, i32 5, <4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <4 x i1> %v
+}
+
+define <4 x i1> @test_vp_splice_v4i1_negative_offset(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer
+
+ %v = call <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1> %va, <4 x i1> %vb, i32 -5, <4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <4 x i1> %v
+}
+
+define <4 x i1> @test_vp_splice_v4i1_masked(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v4i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1> %va, <4 x i1> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <4 x i1> %v
+}
+
+define <8 x i1> @test_vp_splice_v8i1(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer
+
+ %v = call <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1> %va, <8 x i1> %vb, i32 5, <8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <8 x i1> %v
+}
+
+define <8 x i1> @test_vp_splice_v8i1_negative_offset(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer
+
+ %v = call <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1> %va, <8 x i1> %vb, i32 -5, <8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <8 x i1> %v
+}
+
+define <8 x i1> @test_vp_splice_v8i1_masked(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v8i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1> %va, <8 x i1> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <8 x i1> %v
+}
+
+define <16 x i1> @test_vp_splice_v16i1(<16 x i1> %va, <16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer
+
+ %v = call <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1> %va, <16 x i1> %vb, i32 5, <16 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <16 x i1> %v
+}
+
+define <16 x i1> @test_vp_splice_v16i1_negative_offset(<16 x i1> %va, <16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v16i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer
+
+ %v = call <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1> %va, <16 x i1> %vb, i32 -5, <16 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <16 x i1> %v
+}
+
+define <16 x i1> @test_vp_splice_v16i1_masked(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_v16i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1> %va, <16 x i1> %vb, i32 5, <16 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <16 x i1> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll b/llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll
new file mode 100644
index 000000000000..4eaadb3c24fb
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll
@@ -0,0 +1,553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
+; RUN: < %s | FileCheck %s
+
+declare <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32, <vscale x 1 x i1>, i32, i32)
+declare <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32, <vscale x 2 x i1>, i32, i32)
+declare <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32, <vscale x 4 x i1>, i32, i32)
+declare <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32, <vscale x 8 x i1>, i32, i32)
+declare <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32, <vscale x 16 x i1>, i32, i32)
+declare <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32, <vscale x 32 x i1>, i32, i32)
+declare <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32, <vscale x 64 x i1>, i32, i32)
+
+define <vscale x 1 x i1> @test_vp_splice_nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer
+
+ %v = call <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @test_vp_splice_nxv1i1_negative_offset(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, mf8, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer
+
+ %v = call <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 -5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 1 x i1> @test_vp_splice_nxv1i1_masked(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, <vscale x 1 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 5, <vscale x 1 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 2 x i1> @test_vp_splice_nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i1> %v
+}
+
+define <vscale x 2 x i1> @test_vp_splice_nxv2i1_negative_offset(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i1> %v
+}
+
+define <vscale x 2 x i1> @test_vp_splice_nxv2i1_masked(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i1> %v
+}
+
+define <vscale x 4 x i1> @test_vp_splice_nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+
+ %v = call <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 4 x i1> %v
+}
+
+define <vscale x 4 x i1> @test_vp_splice_nxv4i1_negative_offset(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv4i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+
+ %v = call <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 -5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 4 x i1> %v
+}
+
+define <vscale x 4 x i1> @test_vp_splice_nxv4i1_masked(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, <vscale x 4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv4i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 5, <vscale x 4 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 4 x i1> %v
+}
+
+define <vscale x 8 x i1> @test_vp_splice_nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v9, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+
+ %v = call <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 5, <vscale x 8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @test_vp_splice_nxv8i1_negative_offset(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv8i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 5
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+
+ %v = call <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 -5, <vscale x 8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 8 x i1> @test_vp_splice_nxv8i1_masked(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, <vscale x 8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv8i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v11, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 5, <vscale x 8 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 16 x i1> @test_vp_splice_nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer
+
+ %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 5, <vscale x 16 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 16 x i1> %v
+}
+
+define <vscale x 16 x i1> @test_vp_splice_nxv16i1_negative_offset(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv16i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 5
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer
+
+ %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 -5, <vscale x 16 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 16 x i1> %v
+}
+
+define <vscale x 16 x i1> @test_vp_splice_nxv16i1_masked(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, <vscale x 16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv16i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v14, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v10, v14, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT: vslideup.vx v10, v12, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v8, v10, 0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 5, <vscale x 16 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 16 x i1> %v
+}
+
+define <vscale x 32 x i1> @test_vp_splice_nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a0
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer
+
+ %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 5, <vscale x 32 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 32 x i1> %v
+}
+
+define <vscale x 32 x i1> @test_vp_splice_nxv32i1_negative_offset(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv32i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, m4, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 5
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer
+
+ %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 -5, <vscale x 32 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 32 x i1> %v
+}
+
+define <vscale x 32 x i1> @test_vp_splice_nxv32i1_masked(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, <vscale x 32 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv32i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v16, v16, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT: vslideup.vx v16, v12, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmsne.vi v8, v16, 0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 5, <vscale x 32 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 32 x i1> %v
+}
+
+define <vscale x 64 x i1> @test_vp_splice_nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v24, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v8, v24, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v16, a0
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 64 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> undef, <vscale x 64 x i32> zeroinitializer
+
+ %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 5, <vscale x 64 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 64 x i1> %v
+}
+
+define <vscale x 64 x i1> @test_vp_splice_nxv64i1_negative_offset(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv64i1_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v24, 0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v8, v24, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, m8, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v16, 5
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 64 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> undef, <vscale x 64 x i32> zeroinitializer
+
+ %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 -5, <vscale x 64 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 64 x i1> %v
+}
+
+define <vscale x 64 x i1> @test_vp_splice_nxv64i1_masked(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, <vscale x 64 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv64i1_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v24, 0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v24, v24, 1, v0
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vslidedown.vi v24, v24, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; CHECK-NEXT: vslideup.vx v24, v16, a0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmsne.vi v8, v24, 0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 5, <vscale x 64 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 64 x i1> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/vp-splice.ll b/llvm/test/CodeGen/RISCV/vp-splice.ll
new file mode 100644
index 000000000000..7d85370e390b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/vp-splice.ll
@@ -0,0 +1,330 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v -verify-machineinstrs \
+; RUN: < %s | FileCheck %s
+
+declare <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32, <vscale x 2 x i1>, i32, i32)
+
+declare <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32, <vscale x 1 x i1>, i32, i32)
+declare <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32, <vscale x 2 x i1>, i32, i32)
+declare <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32, <vscale x 4 x i1>, i32, i32)
+declare <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32, <vscale x 8 x i1>, i32, i32)
+
+declare <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32, <vscale x 1 x i1>, i32, i32)
+declare <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32, <vscale x 2 x i1>, i32, i32)
+
+declare <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32, <vscale x 16 x i1>, i32, i32)
+
+define <vscale x 2 x i64> @test_vp_splice_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @test_vp_splice_nxv2i64_negative_offset(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i64_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e64, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 5
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @test_vp_splice_nxv2i64_masked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i64_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v10, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 1 x i64> @test_vp_splice_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer
+
+ %v = call <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @test_vp_splice_nxv1i64_negative_offset(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1i64_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer
+
+ %v = call <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 -5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @test_vp_splice_nxv1i64_masked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1i64_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 5, <vscale x 1 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 2 x i32> @test_vp_splice_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @test_vp_splice_nxv2i32_negative_offset(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i32_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @test_vp_splice_nxv2i32_masked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2i32_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 4 x i16> @test_vp_splice_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+
+ %v = call <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @test_vp_splice_nxv4i16_negative_offset(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv4i16_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
+
+ %v = call <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 -5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @test_vp_splice_nxv4i16_masked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv4i16_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 5, <vscale x 4 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i8> @test_vp_splice_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+
+ %v = call <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 5, <vscale x 8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @test_vp_splice_nxv8i8_negative_offset(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv8i8_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+
+ %v = call <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 -5, <vscale x 8 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @test_vp_splice_nxv8i8_masked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv8i8_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 5, <vscale x 8 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 1 x double> @test_vp_splice_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer
+
+ %v = call <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x double> %v
+}
+
+define <vscale x 1 x double> @test_vp_splice_nxv1f64_negative_offset(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1f64_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer
+
+ %v = call <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 -5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x double> %v
+}
+
+define <vscale x 1 x double> @test_vp_splice_nxv1f64_masked(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, <vscale x 1 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv1f64_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 5, <vscale x 1 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 1 x double> %v
+}
+
+define <vscale x 2 x float> @test_vp_splice_nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x float> %v
+}
+
+define <vscale x 2 x float> @test_vp_splice_nxv2f32_negative_offset(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2f32_negative_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 5
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0
+ %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+
+ %v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x float> %v
+}
+
+define <vscale x 2 x float> @test_vp_splice_nxv2f32_masked(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) {
+; CHECK-LABEL: test_vp_splice_nxv2f32_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, -5
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
+ ret <vscale x 2 x float> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 85d28122537e..f878d17d5f1d 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -5927,4 +5927,3 @@ declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
-
diff --git a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll
index b94c50dd7e8c..74bf1a8929cb 100644
--- a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll
+++ b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll
@@ -315,4 +315,3 @@ define i1 @no_same_ops(i64 %c, i64 %a, i64 %b) {
%res = or i1 %l0, %l1
ret i1 %res
}
-
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-05.ll b/llvm/test/CodeGen/SystemZ/atomic-load-05.ll
index 979f1e684e89..f406bc6d2ce6 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-05.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-05.ll
@@ -1,14 +1,22 @@
-; Test 128-bit atomic loads.
+; Test 128-bit integer atomic loads.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
define i128 @f1(ptr %src) {
; CHECK-LABEL: f1:
-; CHECK: lpq %r0, 0(%r3)
-; CHECK-DAG: stg %r1, 8(%r2)
-; CHECK-DAG: stg %r0, 0(%r2)
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpq %r0, 0(%r3)
+; CHECK-NEXT: stg %r1, 8(%r2)
+; CHECK-NEXT: stg %r0, 0(%r2)
+; CHECK-NEXT: br %r14
%val = load atomic i128, ptr %src seq_cst, align 16
ret i128 %val
}
+
+define i128 @f2(ptr %src) {
+; CHECK-LABEL: f2:
+; CHECK: brasl %r14, __atomic_load@PLT
+ %val = load atomic i128, ptr %src seq_cst, align 8
+ ret i128 %val
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 069d2168e19a..4d914e3ea0e1 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -1,19 +1,25 @@
-; Test long double atomic loads. Expect a libcall.
+; Test long double atomic loads. These are emitted by the Clang FE as i128
+; loads with a bitcast, and this test case gets converted into that form as
+; well by the AtomicExpand pass.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
define void @f1(ptr %ret, ptr %src) {
; CHECK-LABEL: f1:
-; CHECK: lgr [[RET:%r[0-9]+]], %r2
-; CHECK: la %r4, 160(%r15)
-; CHECK: lghi %r2, 16
-; CHECK: lhi %r5, 5
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpq %r0, 0(%r3)
+; CHECK-NEXT: stg %r1, 8(%r2)
+; CHECK-NEXT: stg %r0, 0(%r2)
+; CHECK-NEXT: br %r14
+ %val = load atomic fp128, ptr %src seq_cst, align 16
+ store fp128 %val, ptr %ret, align 8
+ ret void
+}
+
+define void @f2(ptr %ret, ptr %src) {
+; CHECK-LABEL: f2:
; CHECK: brasl %r14, __atomic_load@PLT
-; CHECK: ld [[FL:%f[0-9]+]], 160(%r15)
-; CHECK: ld [[FH:%f[0-9]+]], 168(%r15)
-; CHECK: std [[FL]], 0([[RET]])
-; CHECK: std [[FH]], 8([[RET]])
-; CHECK: br %r14
%val = load atomic fp128, ptr %src seq_cst, align 8
store fp128 %val, ptr %ret, align 8
ret void
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-05.ll b/llvm/test/CodeGen/SystemZ/atomic-store-05.ll
index dad7d9527b84..e4af7ad57e38 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-05.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-05.ll
@@ -1,4 +1,4 @@
-; Test 128-bit atomic stores.
+; Test 128-bit integer atomic stores.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
@@ -24,3 +24,10 @@ define void @f2(i128 %val, ptr %src) {
store atomic i128 %val, ptr %src monotonic, align 16
ret void
}
+
+define void @f3(i128 %val, ptr %src) {
+; CHECK-LABEL: f3:
+; CHECK: brasl %r14, __atomic_store@PLT
+ store atomic i128 %val, ptr %src seq_cst, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
index fd39793faefc..b748bfc767a4 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
@@ -1,13 +1,17 @@
-; Test float atomic loads.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; Test float atomic stores.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
define void @f1(ptr %src, float %val) {
; CHECK-LABEL: f1:
-; CHECK: lgdr [[R:%r[0-9]+]], %f0
-; CHECK: srlg [[R]], [[R]], 32
-; CHECK: st [[R]], 0(%r2)
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $f0s killed $f0s def $f0d
+; CHECK-NEXT: lgdr %r0, %f0
+; CHECK-NEXT: srlg %r0, %r0, 32
+; CHECK-NEXT: st %r0, 0(%r2)
+; CHECK-NEXT: bcr 15, %r0
+; CHECK-NEXT: br %r14
store atomic float %val, ptr %src seq_cst, align 4
ret void
}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-07.ll b/llvm/test/CodeGen/SystemZ/atomic-store-07.ll
index c904b738f2c5..11f81ae1e07d 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-07.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-07.ll
@@ -1,11 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test double atomic stores.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
define void @f1(ptr %dst, double %val) {
; CHECK-LABEL: f1:
-; CHECK: std %f0, 0(%r2)
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: std %f0, 0(%r2)
+; CHECK-NEXT: bcr 15, %r0
+; CHECK-NEXT: br %r14
store atomic double %val, ptr %dst seq_cst, align 8
ret void
}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
index b33b283e8dbd..f7f4f4d967db 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
@@ -1,19 +1,25 @@
-; Test long double atomic stores. Expect a libcall.
+; Test long double atomic stores. The atomic store is converted to i128 by
+; the AtomicExpand pass.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
define void @f1(ptr %dst, ptr %src) {
; CHECK-LABEL: f1:
-; CHECK: ld [[FL:%f[0-9]+]], 0(%r3)
-; CHECK: ld [[FH:%f[0-9]+]], 8(%r3)
-; CHECK: lgr %r3, %r2
-; CHECK: std [[FL]], 160(%r15)
-; CHECK: std [[FH]], 168(%r15)
-; CHECK: la %r4, 160(%r15)
-; CHECK: lghi %r2, 16
-; CHECK: lhi %r5, 5
+; CHECK: # %bb.0:
+; CHECK-NEXT: lg %r1, 8(%r3)
+; CHECK-NEXT: lg %r0, 0(%r3)
+; CHECK-NEXT: stpq %r0, 0(%r2)
+; CHECK-NEXT: bcr 1{{[45]}}, %r0
+; CHECK-NEXT: br %r14
+ %val = load fp128, ptr %src, align 8
+ store atomic fp128 %val, ptr %dst seq_cst, align 16
+ ret void
+}
+
+define void @f2(ptr %dst, ptr %src) {
+; CHECK-LABEL: f2:
; CHECK: brasl %r14, __atomic_store@PLT
-; CHECK: br %r14
%val = load fp128, ptr %src, align 8
store atomic fp128 %val, ptr %dst seq_cst, align 8
ret void
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 40b512d68be8..0826faa1071b 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -370,33 +370,12 @@ entry:
ret <16 x i16> %2
}
-;; FIXME: This should generate the same output as above, but let's fix the crash first.
define <16 x bfloat> @test_no_vbroadcast2() nounwind {
-; X86-LABEL: test_no_vbroadcast2:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebp # encoding: [0x55]
-; X86-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-NEXT: andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
-; X86-NEXT: subl $64, %esp # encoding: [0x83,0xec,0x40]
-; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X86-NEXT: vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X86-NEXT: vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-NEXT: popl %ebp # encoding: [0x5d]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_no_vbroadcast2:
-; X64: # %bb.0: # %entry
-; X64-NEXT: pushq %rbp # encoding: [0x55]
-; X64-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
-; X64-NEXT: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
-; X64-NEXT: subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
-; X64-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
-; X64-NEXT: vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
-; X64-NEXT: vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
-; X64-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
-; X64-NEXT: popq %rbp # encoding: [0x5d]
-; X64-NEXT: retq # encoding: [0xc3]
+; CHECK-LABEL: test_no_vbroadcast2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
entry:
%0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
%1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 4792e8343d75..9c65310f79d7 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2212,17 +2212,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
-; AVXNC-NEXT: pushq %rbp
-; AVXNC-NEXT: movq %rsp, %rbp
-; AVXNC-NEXT: andq $-32, %rsp
-; AVXNC-NEXT: subq $64, %rsp
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
-; AVXNC-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT: vmovaps %xmm0, (%rsp)
-; AVXNC-NEXT: vmovaps (%rsp), %ymm0
-; AVXNC-NEXT: movq %rbp, %rsp
-; AVXNC-NEXT: popq %rbp
+; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
+; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT: retq
%b = fptrunc <16 x float> %a to <16 x bfloat>
ret <16 x bfloat> %b
@@ -2461,3 +2454,92 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
%b = fptrunc <8 x double> %a to <8 x bfloat>
ret <8 x bfloat> %b
}
+
+define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
+; SSE2-LABEL: test_v8bf16_v32bf16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: retq
+;
+; F16-LABEL: test_v8bf16_v32bf16:
+; F16: # %bb.0:
+; F16-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; F16-NEXT: retq
+;
+; AVXNC-LABEL: test_v8bf16_v32bf16:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVXNC-NEXT: vmovaps %ymm0, %ymm1
+; AVXNC-NEXT: retq
+ %2 = load <8 x bfloat>, ptr %0, align 16
+ %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <32 x bfloat> %3
+}
+
+define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; SSE2-LABEL: concat_v8bf16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: concat_v8bf16:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x bfloat> %a
+}
+
+define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
+; SSE2-LABEL: extract_v32bf16_v8bf16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pextrw $0, %xmm1, %eax
+; SSE2-NEXT: pextrw $1, %xmm1, %ecx
+; SSE2-NEXT: shll $16, %ecx
+; SSE2-NEXT: orl %eax, %ecx
+; SSE2-NEXT: pextrw $2, %xmm1, %eax
+; SSE2-NEXT: pextrw $3, %xmm1, %edx
+; SSE2-NEXT: shll $16, %edx
+; SSE2-NEXT: orl %eax, %edx
+; SSE2-NEXT: shlq $32, %rdx
+; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: pextrw $4, %xmm1, %eax
+; SSE2-NEXT: pextrw $5, %xmm1, %ecx
+; SSE2-NEXT: shll $16, %ecx
+; SSE2-NEXT: orl %eax, %ecx
+; SSE2-NEXT: pextrw $6, %xmm1, %eax
+; SSE2-NEXT: pextrw $7, %xmm1, %esi
+; SSE2-NEXT: shll $16, %esi
+; SSE2-NEXT: orl %eax, %esi
+; SSE2-NEXT: shlq $32, %rsi
+; SSE2-NEXT: orq %rcx, %rsi
+; SSE2-NEXT: movq %rsi, %xmm1
+; SSE2-NEXT: movq %rdx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: extract_v32bf16_v8bf16:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x bfloat> %a
+}
+
+define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; SSE2-LABEL: concat_zero_v8bf16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: concat_zero_v8bf16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = shufflevector <8 x bfloat> %x, <8 x bfloat> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x bfloat> %a
+}
diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll
new file mode 100644
index 000000000000..a3c3fc70e976
--- /dev/null
+++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=x86_64-grtev4-linux-gnu < %s | FileCheck %s
+
+%struct.wibble = type { %struct.wombat }
+%struct.wombat = type { %struct.ham, [3 x i8] }
+%struct.ham = type { %struct.zot }
+%struct.zot = type { %struct.blam }
+%struct.blam = type { %struct.ham.0 }
+%struct.ham.0 = type { %struct.bar }
+%struct.bar = type { %struct.bar.1 }
+%struct.bar.1 = type { %struct.baz, i8 }
+%struct.baz = type { %struct.snork }
+%struct.snork = type <{ %struct.spam, i8, [3 x i8] }>
+%struct.spam = type { %struct.snork.2, %struct.snork.2 }
+%struct.snork.2 = type { i32 }
+%struct.snork.3 = type { %struct.baz, i8, [3 x i8] }
+
+define void @foo(ptr %arg, ptr %arg1, i40 %arg2, ptr %arg3, i32 %arg4) #0 {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: movl %r8d, %r14d
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %r13
+; CHECK-NEXT: movq %rdi, %r15
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: # implicit-def: $r12
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %bb17
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: movq %r15, %r13
+; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: testq %rbx, %rbx
+; CHECK-NEXT: sete %r15b
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: callq _Znwm@PLT
+; CHECK-NEXT: shll $4, %r15d
+; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; CHECK-NEXT: movq %r12, %rcx
+; CHECK-NEXT: shrq $32, %rcx
+; CHECK-NEXT: movb %cl, 12(%rax)
+; CHECK-NEXT: movl %r12d, 8(%rax)
+; CHECK-NEXT: movq %r15, %rbx
+; CHECK-NEXT: movq %r13, %r15
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; CHECK-NEXT: decl %r14d
+; CHECK-NEXT: je .LBB0_8
+; CHECK-NEXT: .LBB0_3: # %bb7
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: callq widget@PLT
+; CHECK-NEXT: cmpb $-5, (%r13)
+; CHECK-NEXT: jae .LBB0_5
+; CHECK-NEXT: # %bb.4: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: movl %r12d, %r12d
+; CHECK-NEXT: cmpq %r15, %rbx
+; CHECK-NEXT: jbe .LBB0_1
+; CHECK-NEXT: jmp .LBB0_7
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_5: # %bb12
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: movq 0, %rax
+; CHECK-NEXT: movq 8, %rax
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; CHECK-NEXT: cmpq %r15, %rbx
+; CHECK-NEXT: jbe .LBB0_1
+; CHECK-NEXT: .LBB0_7: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: decl %r14d
+; CHECK-NEXT: jne .LBB0_3
+; CHECK-NEXT: .LBB0_8: # %bb21
+; CHECK-NEXT: cmpb $0, 12(%rax)
+; CHECK-NEXT: jne .LBB0_10
+; CHECK-NEXT: # %bb.9: # %bb26
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_10: # %bb25
+; CHECK-NEXT: .cfi_def_cfa %rbp, 16
+; CHECK-NEXT: movq %r15, %rdi
+; CHECK-NEXT: callq pluto@PLT
+bb:
+ br label %bb7
+
+bb5: ; preds = %bb17, %bb14
+ %phi = phi ptr [ %call19, %bb17 ], [ null, %bb14 ]
+ %phi6 = phi ptr [ %getelementptr, %bb17 ], [ null, %bb14 ]
+ %add = add i32 %phi9, 1
+ %icmp = icmp eq i32 %phi9, %arg4
+ br i1 %icmp, label %bb21, label %bb7
+
+bb7: ; preds = %bb5, %bb
+ %phi8 = phi ptr [ null, %bb ], [ %phi6, %bb5 ]
+ %phi9 = phi i32 [ 0, %bb ], [ %add, %bb5 ]
+ %phi10 = phi i40 [ undef, %bb ], [ %phi15, %bb5 ]
+ %call = call ptr @widget()
+ %load = load i8, ptr %arg1, align 8
+ %icmp11 = icmp ult i8 %load, -5
+ %and = and i40 %phi10, 4294967295
+ br i1 %icmp11, label %bb14, label %bb12
+
+bb12: ; preds = %bb7
+ %load13 = load volatile { i64, i64 }, ptr null, align 4294967296
+ br label %bb14
+
+bb14: ; preds = %bb12, %bb7
+ %phi15 = phi i40 [ %and, %bb7 ], [ %arg2, %bb12 ]
+ %icmp16 = icmp ugt ptr %phi8, %arg
+ br i1 %icmp16, label %bb5, label %bb17
+
+bb17: ; preds = %bb14
+ %icmp18 = icmp eq ptr %phi8, null
+ %zext = zext i1 %icmp18 to i64
+ %call19 = call ptr @_Znwm(i64 0)
+ %getelementptr = getelementptr %struct.wibble, ptr %arg3, i64 %zext
+ %getelementptr20 = getelementptr i8, ptr %call19, i64 8
+ store i40 %phi15, ptr %getelementptr20, align 4
+ br label %bb5
+
+bb21: ; preds = %bb5
+ %getelementptr22 = getelementptr %struct.snork.3, ptr %phi, i64 0, i32 1
+ %load23 = load i8, ptr %getelementptr22, align 4
+ %icmp24 = icmp eq i8 %load23, 0
+ br i1 %icmp24, label %bb26, label %bb25
+
+bb25: ; preds = %bb21
+ call void @pluto(ptr %arg)
+ unreachable
+
+bb26: ; preds = %bb21
+ ret void
+}
+
+define void @eggs(ptr %arg, ptr %arg1) {
+; CHECK-LABEL: eggs:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: movq %rax, %rsi
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+bb:
+ call void @foo(ptr %arg1, ptr %arg, i40 0, ptr null, i32 0)
+ ret void
+}
+
+declare ptr @widget()
+
+declare void @pluto(ptr)
+
+declare ptr @_Znwm(i64)
+
+attributes #0 = { noinline "frame-pointer"="all" }
diff --git a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir
index 8241a1757af5..190b14052d9b 100644
--- a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir
+++ b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir
@@ -9,7 +9,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x2aaaaaab), %bb.2(0x55555555)
; CHECK-NEXT: liveins: $edi
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
+ ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]]
; CHECK-NEXT: JCC_1 %bb.2, 5, implicit killed undef $eflags
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -28,7 +28,7 @@ body: |
; CHECK-NEXT: JCC_1 %bb.5, 5, implicit killed undef $eflags
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al
+ ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al, implicit-def $al
; CHECK-NEXT: RET 0, killed undef $al
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
diff --git a/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir b/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir
new file mode 100644
index 000000000000..fe53aef86e83
--- /dev/null
+++ b/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -enable-subreg-liveness -verify-coalescing -o - %s | FileCheck %s
+
+
+# FIXME: Need to handle subrange updates when coalescing with subreg_to_reg
+# This will fail if x86 enables subregister liveness.
+---
+name: requires_new_subrange_coalesce_subreg_to_reg
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: requires_new_subrange_coalesce_subreg_to_reg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = COPY $eax
+ ; CHECK-NEXT: %b:gr32 = IMPLICIT_DEF
+ ; CHECK-NEXT: %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit
+ ; CHECK-NEXT: JCC_1 %bb.2, 4, implicit undef $eflags
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
+ ; CHECK-NEXT: %c.sub_32bit:gr64 = COPY %a
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: %c.sub_32bit:gr64 = SUBREG_TO_REG %a, %b, %subreg.sub_32bit
+ ; CHECK-NEXT: RET 0, implicit %c
+ bb.0:
+ liveins: $eax
+ %init_eax:gr32 = COPY $eax
+ %a:gr64 = SUBREG_TO_REG 0, %init_eax, %subreg.sub_32bit
+ %b:gr32 = IMPLICIT_DEF
+ %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit
+ JCC_1 %bb.2, 4, implicit undef $eflags
+
+ bb.1:
+ %imm0:gr32 = MOV32r0 implicit-def dead $eflags
+ %a = SUBREG_TO_REG 0, %imm0, %subreg.sub_32bit
+ %c.sub_32bit = COPY %a
+
+ bb.2:
+ %c.sub_32bit = SUBREG_TO_REG %a, %b, %subreg.sub_32bit
+ RET 0, implicit %c
+
+...
diff --git a/llvm/test/CodeGen/X86/code-model-elf-sections.ll b/llvm/test/CodeGen/X86/code-model-elf-sections.ll
index cb19f0d34f59..749d5b6bf904 100644
--- a/llvm/test/CodeGen/X86/code-model-elf-sections.ll
+++ b/llvm/test/CodeGen/X86/code-model-elf-sections.ll
@@ -21,16 +21,16 @@
; SMALL: .data {{.*}} WA {{.*}}
; SMALL: .data.x {{.*}} WA {{.*}}
; SMALL: .data0 {{.*}} WA {{.*}}
-; SMALL: .ldata {{.*}} WA {{.*}}
-; SMALL: .ldata.x {{.*}} WA {{.*}}
+; SMALL: .ldata {{.*}} WAl {{.*}}
+; SMALL: .ldata.x {{.*}} WAl {{.*}}
; SMALL: .ldata0 {{.*}} WA {{.*}}
; SMALL: force_small {{.*}} WA {{.*}}
; SMALL: force_large {{.*}} WAl {{.*}}
; SMALL: foo {{.*}} WA {{.*}}
; SMALL: .bss {{.*}} WA {{.*}}
-; SMALL: .lbss {{.*}} WA {{.*}}
+; SMALL: .lbss {{.*}} WAl {{.*}}
; SMALL: .rodata {{.*}} A {{.*}}
-; SMALL: .lrodata {{.*}} A {{.*}}
+; SMALL: .lrodata {{.*}} Al {{.*}}
; SMALL: .data.rel.ro {{.*}} WA {{.*}}
; SMALL: .tbss {{.*}} WAT {{.*}}
; SMALL: .tdata {{.*}} WAT {{.*}}
@@ -38,16 +38,16 @@
; SMALL-DS: .data {{.*}} WA {{.*}}
; SMALL-DS: .data.x {{.*}} WA {{.*}}
; SMALL-DS: .data0 {{.*}} WA {{.*}}
-; SMALL-DS: .ldata {{.*}} WA {{.*}}
-; SMALL-DS: .ldata.x {{.*}} WA {{.*}}
+; SMALL-DS: .ldata {{.*}} WAl {{.*}}
+; SMALL-DS: .ldata.x {{.*}} WAl {{.*}}
; SMALL-DS: .ldata0 {{.*}} WA {{.*}}
; SMALL-DS: .data.data {{.*}} WA {{.*}}
; SMALL-DS: force_small {{.*}} WA {{.*}}
; SMALL-DS: force_large {{.*}} WAl {{.*}}
; SMALL-DS: foo {{.*}} WA {{.*}}
-; SMALL-DS: .lbss {{.*}} WA {{.*}}
+; SMALL-DS: .lbss {{.*}} WAl {{.*}}
; SMALL-DS: .bss.bss {{.*}} WA {{.*}}
-; SMALL-DS: .lrodata {{.*}} A {{.*}}
+; SMALL-DS: .lrodata {{.*}} Al {{.*}}
; SMALL-DS: .rodata.rodata {{.*}} A {{.*}}
; SMALL-DS: .data.rel.ro.relro {{.*}} WA {{.*}}
; SMALL-DS: .tbss.tbss {{.*}} WAT {{.*}}
diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll
index 6112f2a57b82..afcffb3a7ade 100644
--- a/llvm/test/CodeGen/X86/code-model-elf.ll
+++ b/llvm/test/CodeGen/X86/code-model-elf.ll
@@ -11,6 +11,16 @@
; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC
; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large -large-data-threshold=1000 | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-SMALL-DATA-PIC
+; Check that the relocations we emit are valid.
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=small -filetype=obj -o /dev/null
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=medium -filetype=obj -o /dev/null
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=large -filetype=obj -o /dev/null
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=small -filetype=obj -o /dev/null
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=medium -large-data-threshold=1000 -filetype=obj -o /dev/null
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=medium -filetype=obj -o /dev/null
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large -filetype=obj -o /dev/null
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large -large-data-threshold=1000 -filetype=obj -o /dev/null
+
; Generated from this C source:
;
; static int static_data[10];
@@ -376,7 +386,6 @@ define dso_local ptr @lea_forced_small_data() #0 {
ret ptr @forced_small_data
}
-; TODO: make small and medium instruction sequence the same
define dso_local i32 @load_forced_small_data() #0 {
; SMALL-STATIC-LABEL: load_forced_small_data:
; SMALL-STATIC: # %bb.0:
@@ -385,14 +394,13 @@ define dso_local i32 @load_forced_small_data() #0 {
;
; MEDIUM-STATIC-LABEL: load_forced_small_data:
; MEDIUM-STATIC: # %bb.0:
-; MEDIUM-STATIC-NEXT: movl $forced_small_data, %eax
-; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax
+; MEDIUM-STATIC-NEXT: movl forced_small_data+8(%rip), %eax
; MEDIUM-STATIC-NEXT: retq
;
; LARGE-STATIC-LABEL: load_forced_small_data:
; LARGE-STATIC: # %bb.0:
-; LARGE-STATIC-NEXT: movl $forced_small_data, %eax
-; LARGE-STATIC-NEXT: movl 8(%rax), %eax
+; LARGE-STATIC-NEXT: movl $forced_small_data+8, %eax
+; LARGE-STATIC-NEXT: movl (%rax), %eax
; LARGE-STATIC-NEXT: retq
;
; SMALL-PIC-LABEL: load_forced_small_data:
@@ -402,14 +410,12 @@ define dso_local i32 @load_forced_small_data() #0 {
;
; MEDIUM-SMALL-DATA-PIC-LABEL: load_forced_small_data:
; MEDIUM-SMALL-DATA-PIC: # %bb.0:
-; MEDIUM-SMALL-DATA-PIC-NEXT: leaq forced_small_data(%rip), %rax
-; MEDIUM-SMALL-DATA-PIC-NEXT: movl 8(%rax), %eax
+; MEDIUM-SMALL-DATA-PIC-NEXT: movl forced_small_data+8(%rip), %eax
; MEDIUM-SMALL-DATA-PIC-NEXT: retq
;
; MEDIUM-PIC-LABEL: load_forced_small_data:
; MEDIUM-PIC: # %bb.0:
-; MEDIUM-PIC-NEXT: leaq forced_small_data(%rip), %rax
-; MEDIUM-PIC-NEXT: movl 8(%rax), %eax
+; MEDIUM-PIC-NEXT: movl forced_small_data+8(%rip), %eax
; MEDIUM-PIC-NEXT: retq
;
; LARGE-PIC-LABEL: load_forced_small_data:
@@ -435,7 +441,6 @@ define dso_local i32 @load_forced_small_data() #0 {
ret i32 %rv
}
-; TODO: fix small code model instruction sequences to use 64-bit constants
define dso_local ptr @lea_forced_large_data() #0 {
; SMALL-STATIC-LABEL: lea_forced_large_data:
; SMALL-STATIC: # %bb.0:
@@ -454,8 +459,9 @@ define dso_local ptr @lea_forced_large_data() #0 {
;
; SMALL-PIC-LABEL: lea_forced_large_data:
; SMALL-PIC: # %bb.0:
-; SMALL-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
-; SMALL-PIC-NEXT: leaq forced_large_data@GOTOFF(%rax), %rax
+; SMALL-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx
+; SMALL-PIC-NEXT: movabsq $forced_large_data@GOTOFF, %rax
+; SMALL-PIC-NEXT: addq %rcx, %rax
; SMALL-PIC-NEXT: retq
;
; MEDIUM-SMALL-DATA-PIC-LABEL: lea_forced_large_data:
@@ -497,25 +503,27 @@ define dso_local ptr @lea_forced_large_data() #0 {
define dso_local i32 @load_forced_large_data() #0 {
; SMALL-STATIC-LABEL: load_forced_large_data:
; SMALL-STATIC: # %bb.0:
-; SMALL-STATIC-NEXT: movl forced_large_data+8(%rip), %eax
+; SMALL-STATIC-NEXT: movabsq $forced_large_data+8, %rax
+; SMALL-STATIC-NEXT: movl (%rax), %eax
; SMALL-STATIC-NEXT: retq
;
; MEDIUM-STATIC-LABEL: load_forced_large_data:
; MEDIUM-STATIC: # %bb.0:
-; MEDIUM-STATIC-NEXT: movabsq $forced_large_data, %rax
-; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax
+; MEDIUM-STATIC-NEXT: movabsq $forced_large_data+8, %rax
+; MEDIUM-STATIC-NEXT: movl (%rax), %eax
; MEDIUM-STATIC-NEXT: retq
;
; LARGE-STATIC-LABEL: load_forced_large_data:
; LARGE-STATIC: # %bb.0:
-; LARGE-STATIC-NEXT: movabsq $forced_large_data, %rax
-; LARGE-STATIC-NEXT: movl 8(%rax), %eax
+; LARGE-STATIC-NEXT: movabsq $forced_large_data+8, %rax
+; LARGE-STATIC-NEXT: movl (%rax), %eax
; LARGE-STATIC-NEXT: retq
;
; SMALL-PIC-LABEL: load_forced_large_data:
; SMALL-PIC: # %bb.0:
; SMALL-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
-; SMALL-PIC-NEXT: movl forced_large_data@GOTOFF+8(%rax), %eax
+; SMALL-PIC-NEXT: movabsq $forced_large_data@GOTOFF, %rcx
+; SMALL-PIC-NEXT: movl 8(%rax,%rcx), %eax
; SMALL-PIC-NEXT: retq
;
; MEDIUM-SMALL-DATA-PIC-LABEL: load_forced_large_data:
@@ -563,14 +571,14 @@ define dso_local i32 @load_global_data() #0 {
;
; MEDIUM-STATIC-LABEL: load_global_data:
; MEDIUM-STATIC: # %bb.0:
-; MEDIUM-STATIC-NEXT: movabsq $global_data, %rax
-; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax
+; MEDIUM-STATIC-NEXT: movabsq $global_data+8, %rax
+; MEDIUM-STATIC-NEXT: movl (%rax), %eax
; MEDIUM-STATIC-NEXT: retq
;
; LARGE-STATIC-LABEL: load_global_data:
; LARGE-STATIC: # %bb.0:
-; LARGE-STATIC-NEXT: movabsq $global_data, %rax
-; LARGE-STATIC-NEXT: movl 8(%rax), %eax
+; LARGE-STATIC-NEXT: movabsq $global_data+8, %rax
+; LARGE-STATIC-NEXT: movl (%rax), %eax
; LARGE-STATIC-NEXT: retq
;
; SMALL-PIC-LABEL: load_global_data:
@@ -580,8 +588,7 @@ define dso_local i32 @load_global_data() #0 {
;
; MEDIUM-SMALL-DATA-PIC-LABEL: load_global_data:
; MEDIUM-SMALL-DATA-PIC: # %bb.0:
-; MEDIUM-SMALL-DATA-PIC-NEXT: leaq global_data(%rip), %rax
-; MEDIUM-SMALL-DATA-PIC-NEXT: movl 8(%rax), %eax
+; MEDIUM-SMALL-DATA-PIC-NEXT: movl global_data+8(%rip), %eax
; MEDIUM-SMALL-DATA-PIC-NEXT: retq
;
; MEDIUM-PIC-LABEL: load_global_data:
@@ -684,14 +691,14 @@ define dso_local i32 @load_unknown_size_data() #0 {
;
; MEDIUM-STATIC-LABEL: load_unknown_size_data:
; MEDIUM-STATIC: # %bb.0:
-; MEDIUM-STATIC-NEXT: movabsq $unknown_size_data, %rax
-; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax
+; MEDIUM-STATIC-NEXT: movabsq $unknown_size_data+8, %rax
+; MEDIUM-STATIC-NEXT: movl (%rax), %eax
; MEDIUM-STATIC-NEXT: retq
;
; LARGE-STATIC-LABEL: load_unknown_size_data:
; LARGE-STATIC: # %bb.0:
-; LARGE-STATIC-NEXT: movabsq $unknown_size_data, %rax
-; LARGE-STATIC-NEXT: movl 8(%rax), %eax
+; LARGE-STATIC-NEXT: movabsq $unknown_size_data+8, %rax
+; LARGE-STATIC-NEXT: movl (%rax), %eax
; LARGE-STATIC-NEXT: retq
;
; SMALL-PIC-LABEL: load_unknown_size_data:
@@ -1127,8 +1134,7 @@ define dso_local float @load_constant_pool(float %x) #0 {
;
; MEDIUM-STATIC-LABEL: load_constant_pool:
; MEDIUM-STATIC: # %bb.0:
-; MEDIUM-STATIC-NEXT: movl ${{\.?LCPI[0-9]+_[0-9]+}}, %eax
-; MEDIUM-STATIC-NEXT: addss (%rax), %xmm0
+; MEDIUM-STATIC-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; MEDIUM-STATIC-NEXT: retq
;
; LARGE-STATIC-LABEL: load_constant_pool:
diff --git a/llvm/test/CodeGen/X86/fast-isel-large-object.ll b/llvm/test/CodeGen/X86/fast-isel-large-object.ll
index 6ca2c4240723..9acdfdeaf7cc 100644
--- a/llvm/test/CodeGen/X86/fast-isel-large-object.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-large-object.ll
@@ -6,8 +6,9 @@
define ptr @f() {
; CHECK-LABEL: f:
; CHECK: # %bb.0:
-; CHECK-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
-; CHECK-NEXT: leaq g@GOTOFF(%rax), %rax
+; CHECK-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx
+; CHECK-NEXT: movabsq $g@GOTOFF, %rax
+; CHECK-NEXT: addq %rcx, %rax
; CHECK-NEXT: retq
ret ptr @g
}
diff --git a/llvm/test/CodeGen/X86/fold-add.ll b/llvm/test/CodeGen/X86/fold-add.ll
index 597e51d877eb..8c28d66597fb 100644
--- a/llvm/test/CodeGen/X86/fold-add.ll
+++ b/llvm/test/CodeGen/X86/fold-add.ll
@@ -45,8 +45,7 @@ define dso_local i64 @one() #0 {
;
; MSTATIC-LABEL: one:
; MSTATIC: # %bb.0: # %entry
-; MSTATIC-NEXT: movabsq $foo, %rax
-; MSTATIC-NEXT: incq %rax
+; MSTATIC-NEXT: movabsq $foo+1, %rax
; MSTATIC-NEXT: retq
;
; MPIC-LABEL: one:
diff --git a/llvm/test/CodeGen/X86/insert.ll b/llvm/test/CodeGen/X86/insert.ll
new file mode 100644
index 000000000000..381de2ecaa16
--- /dev/null
+++ b/llvm/test/CodeGen/X86/insert.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+
+define i64 @sub8(i64 noundef %res, ptr %byte) {
+; X86-LABEL: sub8:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb (%ecx), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: sub8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movb (%rsi), %al
+; X64-NEXT: retq
+entry:
+ %and = and i64 %res, -256
+ %d = load i8, ptr %byte, align 1
+ %conv2 = zext i8 %d to i64
+ %or = or i64 %and, %conv2
+ ret i64 %or
+}
+
+define i64 @sub16(i64 noundef %res, ptr %byte) {
+; X86-LABEL: sub16:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: sub16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movw (%rsi), %ax
+; X64-NEXT: retq
+entry:
+ %and = and i64 %res, -65536
+ %d = load i16, ptr %byte, align 1
+ %conv2 = zext i16 %d to i64
+ %or = or i64 %and, %conv2
+ ret i64 %or
+}
+
+define i32 @sub8_32(i32 noundef %res, ptr %byte) {
+; X86-LABEL: sub8_32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb (%ecx), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: sub8_32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movb (%rsi), %al
+; X64-NEXT: retq
+entry:
+ %and = and i32 %res, -256
+ %d = load i8, ptr %byte, align 1
+ %conv2 = zext i8 %d to i32
+ %or = or i32 %and, %conv2
+ ret i32 %or
+}
+
+define i32 @sub16_32(i32 noundef %res, ptr %byte) {
+; X86-LABEL: sub16_32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: sub16_32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movw (%rsi), %ax
+; X64-NEXT: retq
+entry:
+ %and = and i32 %res, -65536
+ %d = load i16, ptr %byte, align 1
+ %conv2 = zext i16 %d to i32
+ %or = or i32 %and, %conv2
+ ret i32 %or
+}
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 1fe8d834dbcd..4fbe05cd1b2f 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -347,7 +347,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: shrl %cl, %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %esi, 28(%ecx)
@@ -489,7 +488,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: sarl %cl, %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %ebp, 28(%ecx)
@@ -623,11 +621,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shll %cl, %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ecx, %edi
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %ebp
; i686-NEXT: movl 64(%esp,%ebp), %esi
-; i686-NEXT: movl %edi, %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: movl (%esp), %edi # 4-byte Reload
; i686-NEXT: shldl %cl, %edi, %esi
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index 0e4e70666930..e1466aebf422 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -78,7 +78,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; CHECK-NEXT: movl 28(%esp,%ebp), %edx
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index abab313f4b12..b2b5bcc5b44b 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %ecx, %esi
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir b/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir
new file mode 100644
index 000000000000..6121a0bcc564
--- /dev/null
+++ b/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir
@@ -0,0 +1,348 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# We cannot lose the liveness of the high subregister of %1 when
+# coalesced with %0, so introduce an implicit-def of the super
+# register on the MOV.
+
+---
+name: coalesce_mov32r0_into_subreg_to_reg64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64
+ ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1
+ ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ %0:gr32 = MOV32r0 implicit-def dead $eflags
+ %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit
+ $rdi = COPY %1
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+---
+name: subreg_to_reg_folds_to_undef
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $rax
+
+ ; CHECK-LABEL: name: subreg_to_reg_folds_to_undef
+ ; CHECK: liveins: $rax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY $rax
+ ; CHECK-NEXT: undef %4.sub_32bit:gr64_with_sub_8bit = MOV32rr [[COPY]].sub_32bit, implicit-def %4
+ ; CHECK-NEXT: RET 0, implicit %4
+ %0:gr64 = COPY killed $rax
+ %1:gr32 = COPY killed %0.sub_32bit
+ %2:gr32 = MOV32rr killed %1
+ %3:gr64 = SUBREG_TO_REG 0, killed %2, %subreg.sub_32bit
+ %4:gr64 = COPY killed %3
+ RET 0, implicit %4
+
+...
+
+---
+name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64
+ ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
+ ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
+ %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit
+ $rdi = COPY %1
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+---
+name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64
+ ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1
+ ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %0
+ %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit
+ $rdi = COPY %1
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+---
+name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg
+ ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def undef %1.sub_8bit, implicit-def %1
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit %1
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ %0:gr32 = MOV32r0 implicit-def dead $eflags, implicit-def undef %0.sub_8bit
+ %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit
+ INLINEASM &"", 0, implicit %1
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+
+# Reduced realistic case which was asserting after introducing new implicit-defs
+---
+name: coalesce_needs_implicit_defs
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: coalesce_needs_implicit_defs
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $rdi
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK-NEXT: undef %2.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %2
+ ; CHECK-NEXT: undef %3.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef %10.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
+ ; CHECK-NEXT: TEST64rr %3, %3, implicit-def $eflags
+ ; CHECK-NEXT: %10.sub_8bit:gr64_with_sub_8bit = SETCCr 4, implicit killed $eflags
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi
+ ; CHECK-NEXT: CALL64r %2, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: [[SHL64ri:%[0-9]+]]:gr64_with_sub_8bit = SHL64ri [[SHL64ri]], 4, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64rr:%[0-9]+]]:gr64_with_sub_8bit = ADD64rr [[ADD64rr]], [[COPY]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_with_sub_8bit = COPY [[ADD64rr]]
+ ; CHECK-NEXT: JMP_1 %bb.1
+ bb.0:
+ liveins: $rdi
+
+ %0:gr64 = COPY killed $rdi
+ %1:gr32 = MOV32r0 implicit-def dead $eflags
+ %2:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit
+ %3:gr64 = COPY killed %2
+
+ bb.1:
+ %4:gr64 = COPY killed %3
+ %5:gr32 = MOV32r0 implicit-def dead $eflags
+ TEST64rr killed %4, %4, implicit-def $eflags
+ %6:gr8 = SETCCr 4, implicit killed $eflags
+ %7:gr32 = COPY killed %5
+ %7.sub_8bit:gr32 = COPY killed %6
+ %8:gr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32bit
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ %9:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit
+ $rdi = COPY %9
+ CALL64r killed %9, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ %10:gr64 = COPY killed %8
+ %10:gr64 = SHL64ri %10, 4, implicit-def dead $eflags
+ %11:gr64 = COPY killed %10
+ %11:gr64 = ADD64rr %11, %0, implicit-def dead $eflags
+ %3:gr64 = COPY killed %11
+ JMP_1 %bb.1
+
+...
+
+---
+name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def
+ ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi
+ ; CHECK-NEXT: CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ %0:gr32 = MOV32r0 implicit-def dead $eflags
+ $rdi = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit
+ CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+---
+name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $eax
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $eax = MOV32r0 implicit-def dead $eflags
+ ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, $eax, %subreg.sub_32bit
+ ; CHECK-NEXT: $rdi = COPY [[SUBREG_TO_REG]]
+ ; CHECK-NEXT: CALL64r [[SUBREG_TO_REG]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ $eax = MOV32r0 implicit-def dead $eflags
+ %1:gr64 = SUBREG_TO_REG 0, killed $eax, %subreg.sub_32bit
+ $rdi = COPY %1
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+# The coalesced instruction is a COPY with other implicit operands
+---
+name: coalesce_copy_into_subreg_to_reg64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $eax
+ ; CHECK-LABEL: name: coalesce_copy_into_subreg_to_reg64
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = COPY $eax, implicit-def dead $eflags, implicit-def %1
+ ; CHECK-NEXT: $rdi = COPY %1
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ %0:gr32 = COPY $eax, implicit-def dead $eflags
+ %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit
+ $rdi = COPY %1
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+---
+name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value
+ ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def %1.sub_32bit, implicit %1.sub_32bit
+ ; CHECK-NEXT: $rdi = COPY %1
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ %0:gr32 = MOV32r0 implicit-def dead $eflags
+ INLINEASM &"", 0, implicit-def %0, implicit %0
+ %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit
+ $rdi = COPY %1
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+...
+
+---
+name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1
+ ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags
+ ; CHECK-NEXT: JMP_1 %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: $rdi = COPY %1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: RET 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ bb.0:
+ INLINEASM &"", 0, implicit-def %0:gr32
+ JCC_1 %bb.1, 4, implicit undef $eflags
+ JMP_1 %bb.2
+
+ bb.1:
+ %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit
+ $rdi = COPY %1
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ RET 0
+
+ bb.2:
+
+...
+
+---
+name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1
+ ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags
+ ; CHECK-NEXT: JMP_1 %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $rdi = COPY %1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: JMP_1 %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ bb.0:
+
+ INLINEASM &"", 0, implicit-def %0:gr32
+ JCC_1 %bb.1, 4, implicit undef $eflags
+ JMP_1 %bb.2
+
+ bb.1:
+ %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32bit
+ $rdi = COPY %1
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+ ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ JMP_1 %bb.1
+
+ bb.2:
+
+...
diff --git a/llvm/test/CodeGen/X86/usermsr-intrinsics.ll b/llvm/test/CodeGen/X86/usermsr-intrinsics.ll
index 29801a494f49..fa569affdd9f 100644
--- a/llvm/test/CodeGen/X86/usermsr-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/usermsr-intrinsics.ll
@@ -35,7 +35,7 @@ declare i64 @llvm.x86.urdmsr(i64 %A)
define void @test_int_x86_uwrmsr(i64 %A, i64 %B) nounwind {
; X64-LABEL: test_int_x86_uwrmsr:
; X64: # %bb.0:
-; X64-NEXT: uwrmsr %rdi, %rsi # encoding: [0xf3,0x0f,0x38,0xf8,0xfe]
+; X64-NEXT: uwrmsr %rsi, %rdi # encoding: [0xf3,0x0f,0x38,0xf8,0xfe]
; X64-NEXT: retq # encoding: [0xc3]
call void @llvm.x86.uwrmsr(i64 %A, i64 %B)
ret void
@@ -46,7 +46,7 @@ define void @test_int_x86_uwrmsr_const(i64 %A) nounwind {
; X64: # %bb.0:
; X64-NEXT: uwrmsr %rdi, $123 # encoding: [0xc4,0xe7,0x7a,0xf8,0xc7,0x7b,0x00,0x00,0x00]
; X64-NEXT: retq # encoding: [0xc3]
- call void @llvm.x86.uwrmsr(i64 %A, i64 123)
+ call void @llvm.x86.uwrmsr(i64 123, i64 %A)
ret void
}
@@ -55,9 +55,9 @@ define void @test_int_x86_uwrmsr_const_i64(i64 %A) nounwind {
; X64: # %bb.0:
; X64-NEXT: movabsq $8589934591, %rax # encoding: [0x48,0xb8,0xff,0xff,0xff,0xff,0x01,0x00,0x00,0x00]
; X64-NEXT: # imm = 0x1FFFFFFFF
-; X64-NEXT: uwrmsr %rdi, %rax # encoding: [0xf3,0x0f,0x38,0xf8,0xf8]
+; X64-NEXT: uwrmsr %rdi, %rax # encoding: [0xf3,0x0f,0x38,0xf8,0xc7]
; X64-NEXT: retq # encoding: [0xc3]
- call void @llvm.x86.uwrmsr(i64 %A, i64 8589934591)
+ call void @llvm.x86.uwrmsr(i64 8589934591, i64 %A)
ret void
}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index b1f0dcb9238a..08667aed4bb3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14447,7 +14447,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
@@ -14483,7 +14482,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm11
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -14516,7 +14515,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
@@ -14530,8 +14529,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
@@ -14738,7 +14737,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
@@ -14747,7 +14745,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128>
; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm1
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
@@ -14823,14 +14820,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm13
-; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13
+; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm4
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 91a70fb000dd..db8bca5bc16b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -8490,12 +8490,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512F-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index a060d29200ba..10ccd40e4865 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2480,7 +2480,6 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; SSE-NEXT: psllq $48, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: pandn %xmm5, %xmm1
@@ -2537,7 +2536,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: pandn %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 992b190ac17c..16808dca4511 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -1181,13 +1181,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm9, %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm9, %xmm11
-; SSE-NEXT: pand %xmm1, %xmm11
-; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pand %xmm10, %xmm11
+; SSE-NEXT: movdqa %xmm10, %xmm4
; SSE-NEXT: pandn %xmm0, %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdi), %xmm13
; SSE-NEXT: movdqa %xmm13, %xmm4
-; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: pand %xmm10, %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 176(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm10
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index f82a192c60b5..2b2cb554d6ac 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm9, %xmm7
; SSE-NEXT: pand %xmm14, %xmm7
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm6, %xmm15
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm15
; SSE-NEXT: pand %xmm14, %xmm15
; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: pandn %xmm8, %xmm3
@@ -2148,7 +2148,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pand %xmm13, %xmm9
; SSE-NEXT: por %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm13
@@ -2185,7 +2184,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm6, %xmm2
; SSE-NEXT: por %xmm10, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
@@ -5451,19 +5450,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm14, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm11, %xmm6
; SSE-NEXT: pandn %xmm11, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm5
+; SSE-NEXT: pand %xmm14, %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pand %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 0c2df82fd1be..f2133b9e42d3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -11212,7 +11212,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9
@@ -11289,7 +11288,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -11302,7 +11300,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11
+; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 739a5c879de2..d253dd117b10 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1343,10 +1343,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3]
; SSE-NEXT: movdqa %xmm15, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
-; SSE-NEXT: movdqa %xmm5, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2]
; SSE-NEXT: andps %xmm8, %xmm1
; SSE-NEXT: orps %xmm6, %xmm1
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 24475360cbbc..f84131dfc879 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -1845,7 +1845,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
@@ -2485,7 +2484,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx
@@ -3129,7 +3127,6 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
@@ -3562,7 +3559,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
@@ -4197,7 +4193,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -4879,7 +4874,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi
@@ -5200,7 +5194,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -5211,7 +5205,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
@@ -5534,7 +5527,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -6233,7 +6225,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
@@ -6872,7 +6863,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -7360,9 +7350,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X86: {{.*}}
-; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}
; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
; X64-SHLD: {{.*}}
+; X86: {{.*}}
+; X86-NO-SHLD: {{.*}}
+; X86-SHLD: {{.*}}
diff --git a/llvm/test/DebugInfo/X86/array2.ll b/llvm/test/DebugInfo/X86/array2.ll
index 8b386ca44c5f..4fe9c9feb86e 100644
--- a/llvm/test/DebugInfo/X86/array2.ll
+++ b/llvm/test/DebugInfo/X86/array2.ll
@@ -16,7 +16,7 @@
; RUN: opt --try-experimental-debuginfo-iterators %s -O2 -S -o - | FileCheck %s
; Test that we correctly lower dbg.declares for arrays.
;
-; CHECK: define i32 @main
+; CHECK: define noundef i32 @main
; CHECK: call void @llvm.dbg.value(metadata i32 42, metadata ![[ARRAY:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32))
; CHECK: ![[ARRAY]] = !DILocalVariable(name: "array",{{.*}} line: 6
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/DebugInfo/X86/sret.ll b/llvm/test/DebugInfo/X86/sret.ll
index 087f136541d9..f06b097d07b9 100644
--- a/llvm/test/DebugInfo/X86/sret.ll
+++ b/llvm/test/DebugInfo/X86/sret.ll
@@ -12,9 +12,9 @@
; RUN: llc -O0 -fast-isel=true -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,FASTISEL %s
; RUN: llc -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s
-; RUN: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s
;; FIXME: RemoveDIs - enable when FastISel support is added.
-; run: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s
+; run: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=true -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,FASTISEL %s
+; RUN: llc --try-experimental-debuginfo-iterators -O0 -fast-isel=false -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | llvm-dwarfdump -debug-info - | FileCheck -check-prefixes=CHECK,SDAG %s
; CHECK: _ZN1B9AInstanceEv
; CHECK: DW_TAG_variable
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml
new file mode 100644
index 000000000000..e45f2961d017
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml
@@ -0,0 +1,307 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-jitlink -noexec -phony-externals %t
+#
+# Check that MachO::ARM64_RELOC_SUBTRACTOR relocations work when the fixup
+# location and target are in the same block (in this case in the __eh_frame
+# section).
+
+--- !mach-o
+IsLittleEndian: true
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x1
+ ncmds: 5
+ sizeofcmds: 480
+ flags: 0x0
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: ''
+ vmaddr: 0
+ vmsize: 200
+ fileoff: 544
+ filesize: 200
+ maxprot: 7
+ initprot: 7
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x0
+ size: 72
+ offset: 0x220
+ align: 2
+ reloff: 0x2E8
+ nreloc: 6
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: F44FBEA9FD7B01A9800080520000009448058052080000B901000090210040F9020080D200000094200020D400000094130040B900000094E00313AAFD7B41A9F44FC2A8C0035FD6
+ relocations:
+ - address: 0x34
+ symbolnum: 7
+ pcrel: true
+ length: 2
+ extern: true
+ type: 2
+ scattered: false
+ value: 0
+ - address: 0x2C
+ symbolnum: 6
+ pcrel: true
+ length: 2
+ extern: true
+ type: 2
+ scattered: false
+ value: 0
+ - address: 0x24
+ symbolnum: 8
+ pcrel: true
+ length: 2
+ extern: true
+ type: 2
+ scattered: false
+ value: 0
+ - address: 0x1C
+ symbolnum: 4
+ pcrel: false
+ length: 2
+ extern: true
+ type: 6
+ scattered: false
+ value: 0
+ - address: 0x18
+ symbolnum: 4
+ pcrel: true
+ length: 2
+ extern: true
+ type: 5
+ scattered: false
+ value: 0
+ - address: 0xC
+ symbolnum: 5
+ pcrel: true
+ length: 2
+ extern: true
+ type: 2
+ scattered: false
+ value: 0
+ - sectname: __gcc_except_tab
+ segname: __TEXT
+ addr: 0x48
+ size: 24
+ offset: 0x268
+ align: 2
+ reloff: 0x318
+ nreloc: 1
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FF9B15010C0018000018102C012820000001000000000000
+ relocations:
+ - address: 0x14
+ symbolnum: 4
+ pcrel: true
+ length: 2
+ extern: true
+ type: 7
+ scattered: false
+ value: 0
+ - sectname: __eh_frame
+ segname: __TEXT
+ addr: 0x60
+ size: 72
+ offset: 0x280
+ align: 3
+ reloff: 0x320
+ nreloc: 7
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 1800000000000000017A504C520001781E079B0000000010100C1F002800000004000000F8FFFFFFFFFFFFFF480000000000000008E7FFFFFFFFFFFFFF480E209E019D0293039404
+ relocations:
+ - address: 0x13
+ symbolnum: 9
+ pcrel: true
+ length: 2
+ extern: true
+ type: 7
+ scattered: false
+ value: 0
+ - address: 0x20
+ symbolnum: 1
+ pcrel: false
+ length: 2
+ extern: true
+ type: 1
+ scattered: false
+ value: 0
+ - address: 0x20
+ symbolnum: 2
+ pcrel: false
+ length: 2
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - address: 0x24
+ symbolnum: 2
+ pcrel: false
+ length: 3
+ extern: true
+ type: 1
+ scattered: false
+ value: 0
+ - address: 0x24
+ symbolnum: 3
+ pcrel: false
+ length: 3
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - address: 0x35
+ symbolnum: 2
+ pcrel: false
+ length: 3
+ extern: true
+ type: 1
+ scattered: false
+ value: 0
+ - address: 0x35
+ symbolnum: 0
+ pcrel: false
+ length: 3
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - sectname: __compact_unwind
+ segname: __LD
+ addr: 0xA8
+ size: 32
+ offset: 0x2C8
+ align: 3
+ reloff: 0x358
+ nreloc: 2
+ flags: 0x2000000
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000000480000000000000300000000000000000000000000000000'
+ relocations:
+ - address: 0x0
+ symbolnum: 3
+ pcrel: false
+ length: 3
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - address: 0x18
+ symbolnum: 0
+ pcrel: false
+ length: 3
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 880
+ nsyms: 10
+ stroff: 1040
+ strsize: 152
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 917504
+ sdk: 0
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 59048448
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 872
+ datasize: 0
+ - cmd: LC_LINKER_OPTIMIZATION_HINT
+ cmdsize: 16
+ dataoff: 872
+ datasize: 8
+LinkEditData:
+ NameList:
+ - n_strx: 112
+ n_type: 0xE
+ n_sect: 2
+ n_desc: 32
+ n_value: 72
+ - n_strx: 130
+ n_type: 0xE
+ n_sect: 3
+ n_desc: 0
+ n_value: 96
+ - n_strx: 140
+ n_type: 0xE
+ n_sect: 3
+ n_desc: 0
+ n_value: 124
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 32
+ n_value: 0
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 15
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 41
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 60
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 77
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 90
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ StringTable:
+ - ' '
+ - _main
+ - __ZTIi
+ - ___cxa_allocate_exception
+ - ___cxa_begin_catch
+ - ___cxa_end_catch
+ - ___cxa_throw
+ - ___gxx_personality_v0
+ - GCC_except_table0
+ - EH_Frame1
+ - func.eh
+ - ''
+ - ''
+ - ''
+ - ''
+...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml
new file mode 100644
index 000000000000..05c16d1ad1ca
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml
@@ -0,0 +1,160 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-jitlink -noexec -phony-externals %t
+#
+# Check that MachO::X86_64_RELOC_SUBTRACTOR relocations work when the fixup
+# location and target are in the same block (in this case in the __eh_frame
+# section).
+
+--- !mach-o
+IsLittleEndian: true
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x1
+ ncmds: 4
+ sizeofcmds: 384
+ flags: 0x0
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: ''
+ vmaddr: 0
+ vmsize: 96
+ fileoff: 448
+ filesize: 96
+ maxprot: 7
+ initprot: 7
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x0
+ size: 3
+ offset: 0x1C0
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 31C0C3
+ - sectname: __eh_frame
+ segname: __TEXT
+ addr: 0x8
+ size: 56
+ offset: 0x1C8
+ align: 3
+ reloff: 0x220
+ nreloc: 4
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 1400000000000000017A520001781001100C0708900100001C00000004000000F8FFFFFFFFFFFFFF03000000000000000000000000000000
+ relocations:
+ - address: 0x1C
+ symbolnum: 0
+ pcrel: false
+ length: 2
+ extern: true
+ type: 5
+ scattered: false
+ value: 0
+ - address: 0x1C
+ symbolnum: 1
+ pcrel: false
+ length: 2
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - address: 0x20
+ symbolnum: 1
+ pcrel: false
+ length: 3
+ extern: true
+ type: 5
+ scattered: false
+ value: 0
+ - address: 0x20
+ symbolnum: 2
+ pcrel: false
+ length: 3
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - sectname: __compact_unwind
+ segname: __LD
+ addr: 0x40
+ size: 32
+ offset: 0x200
+ align: 3
+ reloff: 0x240
+ nreloc: 1
+ flags: 0x2000000
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000000030000000000000400000000000000000000000000000000'
+ relocations:
+ - address: 0x0
+ symbolnum: 2
+ pcrel: false
+ length: 3
+ extern: true
+ type: 0
+ scattered: false
+ value: 0
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 584
+ nsyms: 3
+ stroff: 632
+ strsize: 32
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 917504
+ sdk: 0
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 59048448
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 584
+ datasize: 0
+LinkEditData:
+ NameList:
+ - n_strx: 8
+ n_type: 0xE
+ n_sect: 2
+ n_desc: 0
+ n_value: 8
+ - n_strx: 18
+ n_type: 0xE
+ n_sect: 2
+ n_desc: 0
+ n_value: 32
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 32
+ n_value: 0
+ StringTable:
+ - ' '
+ - _main
+ - EH_Frame1
+ - func.eh
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+...
diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll
index 079a9c25bfd7..9bac3897864f 100644
--- a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll
+++ b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll
@@ -1,4 +1,4 @@
-; REQUIRES: native && target-x86_64
+; REQUIRES: native && x86_64-linux
; RUN: lli --jit-linker=rtdyld \
; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s
diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll
index d7bc2dc117b7..31fe730b7403 100644
--- a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll
+++ b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll
@@ -1,4 +1,4 @@
-; REQUIRES: native && target-x86_64
+; REQUIRES: native && x86_64-linux
; In-memory debug-object contains some basic DWARF
;
diff --git a/llvm/test/MC/AArch64/armv9.5a-e3dse.s b/llvm/test/MC/AArch64/armv9.5a-e3dse.s
new file mode 100644
index 000000000000..b69d49ab4e9e
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-e3dse.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+
+mrs x0, VDISR_EL3
+// CHECK: mrs x0, VDISR_EL3 // encoding: [0x20,0xc1,0x3e,0xd5]
+
+msr VDISR_EL3, x0
+// CHECK: msr VDISR_EL3, x0 // encoding: [0x20,0xc1,0x1e,0xd5]
+
+mrs x0, VSESR_EL3
+// CHECK: mrs x0, VSESR_EL3 // encoding: [0x60,0x52,0x3e,0xd5]
+
+msr VSESR_EL3, x0
+// CHECK: msr VSESR_EL3, x0 // encoding: [0x60,0x52,0x1e,0xd5]
diff --git a/llvm/test/MC/AArch64/armv9.5a-fgwte3.s b/llvm/test/MC/AArch64/armv9.5a-fgwte3.s
new file mode 100644
index 000000000000..2352bc7e1ca7
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-fgwte3.s
@@ -0,0 +1,6 @@
+// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+
+mrs x0, FGWTE3_EL3
+// CHECK: mrs x0, FGWTE3_EL3 // encoding: [0xa0,0x11,0x3e,0xd5]
+msr FGWTE3_EL3, x0
+// CHECK: msr FGWTE3_EL3, x0 // encoding: [0xa0,0x11,0x1e,0xd5]
diff --git a/llvm/test/MC/AArch64/armv9.5a-hacdbs.s b/llvm/test/MC/AArch64/armv9.5a-hacdbs.s
new file mode 100644
index 000000000000..8ccba29beb44
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-hacdbs.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+
+mrs x0, HACDBSBR_EL2
+// CHECK: mrs x0, HACDBSBR_EL2 // encoding: [0x80,0x23,0x3c,0xd5]
+msr HACDBSBR_EL2, x0
+// CHECK: msr HACDBSBR_EL2, x0 // encoding: [0x80,0x23,0x1c,0xd5]
+
+mrs x0, HACDBSCONS_EL2
+// CHECK: mrs x0, HACDBSCONS_EL2 // encoding: [0xa0,0x23,0x3c,0xd5]
+msr HACDBSCONS_EL2, x0
+// CHECK: msr HACDBSCONS_EL2, x0 // encoding: [0xa0,0x23,0x1c,0xd5]
+
diff --git a/llvm/test/MC/AArch64/armv9.5a-hdbss.s b/llvm/test/MC/AArch64/armv9.5a-hdbss.s
new file mode 100644
index 000000000000..c4505c9d70e7
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-hdbss.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s
+
+mrs x0, HDBSSBR_EL2
+// CHECK: mrs x0, HDBSSBR_EL2 // encoding: [0x40,0x23,0x3c,0xd5]
+msr HDBSSBR_EL2, x0
+// CHECK: msr HDBSSBR_EL2, x0 // encoding: [0x40,0x23,0x1c,0xd5]
+
+mrs x0, HDBSSPROD_EL2
+// CHECK: mrs x0, HDBSSPROD_EL2 // encoding: [0x60,0x23,0x3c,0xd5]
+msr HDBSSPROD_EL2, x0
+// CHECK: msr HDBSSPROD_EL2, x0 // encoding: [0x60,0x23,0x1c,0xd5]
+
diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s
new file mode 100644
index 000000000000..d06183be9da3
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s
@@ -0,0 +1,57 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr 2>&1 < %s | FileCheck %s
+
+ autiasppc #2
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: autiasppc #2
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ autiasppc #1<<17
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: autiasppc #1<<17
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ autiasppc #-2
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: autiasppc #-2
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ autiasppc w0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: autiasppc w0
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ autiasppc sp
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: autiasppc sp
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ retabsppc #2
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: retabsppc #2
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ retabsppc #(1<<17)
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: retabsppc #(1<<17)
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ retabsppc #-2
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: retabsppc #-2
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ retaasppc w0
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: retaasppc w0
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ retaasppc sp
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: retaasppc sp
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ retaasppc xzr
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset
+// CHECK-NEXT: retaasppc xzr
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s
new file mode 100644
index 000000000000..c10142a19976
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s
@@ -0,0 +1,12 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr -filetype=obj -o /dev/null 2>&1 < %s | FileCheck %s
+
+ autiasppc undef_label
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported
+// CHECK-NEXT: autiasppc undef_label
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
+ autibsppc undef_label
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported
+// CHECK-NEXT: autibsppc undef_label
+// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:
+
diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s
new file mode 100644
index 000000000000..2bc84c13f70f
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s
@@ -0,0 +1,178 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \
+// RUN: | llvm-objdump -d --mattr=+pauth-lr - | FileCheck %s --check-prefix=CHECK-DISASS
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \
+// RUN: | llvm-objdump -d --mattr=-pauth-lr - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+
+// Label at address 4, so we can test that the address shows up in the
+// disassembly.
+ nop
+label1:
+
+ paciasppc
+// CHECK-INST: paciasppc
+// CHECK-DISASS: paciasppc
+// CHECK-ENCODING: [0xfe,0xa3,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac1a3fe <unknown>
+
+ pacibsppc
+// CHECK-INST: pacibsppc
+// CHECK-DISASS: pacibsppc
+// CHECK-ENCODING: [0xfe,0xa7,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac1a7fe <unknown>
+
+ pacnbiasppc
+// CHECK-INST: pacnbiasppc
+// CHECK-DISASS: pacnbiasppc
+// CHECK-ENCODING: [0xfe,0x83,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac183fe <unknown>
+
+ pacnbibsppc
+// CHECK-INST: pacnbibsppc
+// CHECK-DISASS: pacnbibsppc
+// CHECK-ENCODING: [0xfe,0x87,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac187fe <unknown>
+
+ autiasppc label1
+// CHECK-INST: autiasppc label1
+// CHECK-DISASS: autiasppc 0x4 <label1>
+// CHECK-ENCODING: [0bAAA11111,A,0b100AAAAA,0xf3]
+// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: f380009f <unknown>
+
+ autibsppc label1
+// CHECK-INST: autibsppc label1
+// CHECK-DISASS: autibsppc 0x4 <label1>
+// CHECK-ENCODING: [0bAAA11111,A,0b101AAAAA,0xf3]
+// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: f3a000bf <unknown>
+
+ autibsppc #0
+// CHECK-INST: autibsppc #0
+// CHECK-DISASS: autibsppc 0x1c <label1+0x18>
+// CHECK-ENCODING: [0x1f,0x00,0xa0,0xf3]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: f3a0001f <unknown>
+
+ autibsppc #-(1<<18)+4
+// CHECK-INST: autibsppc #-262140
+// CHECK-DISASS: autibsppc 0xfffffffffffc0024 <label1+0xfffffffffffc0020>
+// CHECK-ENCODING: [0xff,0xff,0xbf,0xf3]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: f3bfffff <unknown>
+
+ autiasppc x0
+// CHECK-INST: autiasppc x0
+// CHECK-DISASS: autiasppc x0
+// CHECK-ENCODING: [0x1e,0x90,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac1901e <unknown>
+
+ autibsppc x1
+// CHECK-INST: autibsppc x1
+// CHECK-DISASS: autibsppc x1
+// CHECK-ENCODING: [0x3e,0x94,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac1943e <unknown>
+
+ autiasppc xzr
+// CHECK-INST: autiasppc xzr
+// CHECK-DISASS: autiasppc xzr
+// CHECK-ENCODING: [0xfe,0x93,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac193fe <unknown>
+
+ autibsppc xzr
+// CHECK-INST: autibsppc xzr
+// CHECK-DISASS: autibsppc xzr
+// CHECK-ENCODING: [0xfe,0x97,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac197fe <unknown>
+
+ pacia171615
+// CHECK-INST: pacia171615
+// CHECK-DISASS: pacia171615
+// CHECK-ENCODING: [0xfe,0x8b,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac18bfe <unknown>
+
+ pacib171615
+// CHECK-INST: pacib171615
+// CHECK-DISASS: pacib171615
+// CHECK-ENCODING: [0xfe,0x8f,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac18ffe <unknown>
+
+ autia171615
+// CHECK-INST: autia171615
+// CHECK-DISASS: autia171615
+// CHECK-ENCODING: [0xfe,0xbb,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac1bbfe <unknown>
+
+ autib171615
+// CHECK-INST: autib171615
+// CHECK-DISASS: autib171615
+// CHECK-ENCODING: [0xfe,0xbf,0xc1,0xda]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: dac1bffe <unknown>
+
+ retaasppc label1
+// CHECK-INST: retaasppc label1
+// CHECK-DISASS: retaasppc 0x4 <label1>
+// CHECK-ENCODING: [0bAAA11111,A,0b000AAAAA,0x55]
+// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: 5500021f <unknown>
+
+ retabsppc label1
+// CHECK-INST: retabsppc label1
+// CHECK-DISASS: retabsppc 0x4 <label1>
+// CHECK-ENCODING: [0bAAA11111,A,0b001AAAAA,0x55]
+// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: 5520023f <unknown>
+
+ retaasppc #0
+// CHECK-INST: retaasppc #0
+// CHECK-DISASS: retaasppc 0x4c <label1+0x48>
+// CHECK-ENCODING: [0x1f,0x00,0x00,0x55]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: 5500001f <unknown>
+
+ retaasppc #-(1<<18)+4
+// CHECK-INST: retaasppc #-262140
+// CHECK-DISASS: retaasppc 0xfffffffffffc0054 <label1+0xfffffffffffc0050>
+// CHECK-ENCODING: [0xff,0xff,0x1f,0x55]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: 551fffff <unknown>
+
+ retaasppc x2
+// CHECK-INST: retaasppc x2
+// CHECK-DISASS: retaasppc x2
+// CHECK-ENCODING: [0xe2,0x0b,0x5f,0xd6]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: d65f0be2 <unknown>
+
+ retabsppc x3
+// CHECK-INST: retabsppc x3
+// CHECK-DISASS: retabsppc x3
+// CHECK-ENCODING: [0xe3,0x0f,0x5f,0xd6]
+// CHECK-ERROR: instruction requires: pauth-lr
+// CHECK-UNKNOWN: d65f0fe3 <unknown>
+
+ pacm
+// CHECK-INST: pacm
+// CHECK-DISASS: pacm
+// CHECK-ENCODING: [0xff,0x24,0x03,0xd5]
+// CHECK-ERROR-NOT: instruction requires:
+// CHECK-UNKNOWN: d50324ff hint #39
diff --git a/llvm/test/MC/AArch64/armv9.5a-tlbiw.s b/llvm/test/MC/AArch64/armv9.5a-tlbiw.s
new file mode 100644
index 000000000000..435ed06b33c8
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9.5a-tlbiw.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+tlbiw -mattr=+xs < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-XS %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+tlbiw < %s 2> %t | FileCheck --check-prefix=CHECK-TLBIW %s && FileCheck --check-prefix=ERROR-NO-XS-TLBIW %s < %t
+// RUN: not llvm-mc -triple aarch64 < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-TLBIW --check-prefix=ERROR-NO-XS-TLBIW %s
+
+tlbi VMALLWS2E1
+// CHECK-TLBIW: tlbi vmallws2e1 // encoding: [0x5f,0x86,0x0c,0xd5]
+// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1 requires: tlbiw
+
+tlbi VMALLWS2E1IS
+// CHECK-TLBIW: tlbi vmallws2e1is // encoding: [0x5f,0x82,0x0c,0xd5]
+// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1IS requires: tlbiw
+
+tlbi VMALLWS2E1OS
+// CHECK-TLBIW: tlbi vmallws2e1os // encoding: [0x5f,0x85,0x0c,0xd5]
+// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1OS requires: tlbiw
+
+tlbi VMALLWS2E1nXS
+// CHECK-XS: tlbi vmallws2e1nxs // encoding: [0x5f,0x96,0x0c,0xd5]
+// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1nXS requires: xs, tlbiw
+
+tlbi VMALLWS2E1ISnXS
+// CHECK-XS: tlbi vmallws2e1isnxs // encoding: [0x5f,0x92,0x0c,0xd5]
+// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1ISnXS requires: xs, tlbiw
+
+tlbi VMALLWS2E1OSnXS
+// CHECK-XS: tlbi vmallws2e1osnxs // encoding: [0x5f,0x95,0x0c,0xd5]
+// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1OSnXS requires: xs, tlbiw
diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
index 89078c1ad4e0..e01eb05e8558 100644
--- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
@@ -2013,3 +2013,6 @@ ds_sub_clamp_rtn_u32 v5, v1, v2
ds_sub_clamp_u32 v1, v2
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_features.s b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
new file mode 100644
index 000000000000..7e58bdb3b444
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s
+
+//
+// Elements of CPol operand can be given in any order
+//
+
+image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT scope:SCOPE_SE
+// GFX12: encoding: [0x00,0x00,0x40,0xd0,0x00,0x00,0x24,0x00,0x00,0x00,0x00,0x00]
+
+image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D scope:SCOPE_SE th:TH_LOAD_HT
+// GFX12: encoding: [0x00,0x00,0x40,0xd0,0x00,0x00,0x24,0x00,0x00,0x00,0x00,0x00]
+
+image_sample v[29:30], [v31, v32, v33], s[32:39], s[68:71] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_LOAD_NT scope:SCOPE_SYS
+// GFX12: encoding: [0x02,0xc0,0xc6,0xe4,0x1d,0x40,0x1c,0x22,0x1f,0x20,0x21,0x00]
+
+image_sample v[29:30], [v31, v32, v33], s[32:39], s[68:71] dmask:0x3 dim:SQ_RSRC_IMG_3D scope:SCOPE_SYS th:TH_LOAD_NT
+// GFX12: encoding: [0x02,0xc0,0xc6,0xe4,0x1d,0x40,0x1c,0x22,0x1f,0x20,0x21,0x00]
+
+buffer_load_b32 v5, off, s[8:11], s3 offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_load_b32 v5, off, s[8:11], s3 offset:8388607 scope:SCOPE_DEV th:TH_LOAD_NT_HT
+// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
+
+tbuffer_load_d16_format_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_UINT] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0xbc,0x02,0x00,0xff,0xff,0x7f]
+
+tbuffer_load_d16_format_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_UINT] offset:8388607 scope:SCOPE_SYS th:TH_LOAD_BYPASS
+// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0xbc,0x02,0x00,0xff,0xff,0x7f]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
index c0ffc5247d90..95d352b421a2 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
@@ -1266,6 +1266,30 @@ global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN
global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN
// GFX12: encoding: [0x7c,0x80,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:-64
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:-64
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
global_atomic_sub_u32 v0, v2, s[0:1] offset:-64
// GFX12: encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
index f7a554aedb74..069b71b7229c 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
@@ -1,6 +1,7 @@
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX12,AMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA
// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL
@@ -10,6 +11,7 @@
// GFX8-NOT: error:
// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+
// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100
+// GFX12: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1200
// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810
.warning "test_target"
.amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+"
@@ -228,8 +230,10 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid1
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: .amdhsa_next_free_vgpr directive is required
+// PREGFX10: error: directive requires gfx10 or gfx11
+// GFX10: error: .amdhsa_next_free_vgpr directive is required
+// GFX11: error: .amdhsa_next_free_vgpr directive is required
+// GFX12: error: directive requires gfx10 or gfx11
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_shared_vgpr_count_invalid1"
.amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid1
@@ -237,8 +241,10 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid2
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: shared_vgpr_count directive not valid on wavefront size 32
+// PREGFX10: error: directive requires gfx10 or gfx11
+// GFX10: error: shared_vgpr_count directive not valid on wavefront size 32
+// GFX11: error: shared_vgpr_count directive not valid on wavefront size 32
+// GFX12: error: directive requires gfx10 or gfx11
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_shared_vgpr_count_invalid2"
.amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid2
@@ -249,8 +255,10 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid3
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: value out of range
+// PREGFX10: error: directive requires gfx10 or gfx11
+// GFX10: error: value out of range
+// GFX11: error: value out of range
+// GFX12: error: directive requires gfx10 or gfx11
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_shared_vgpr_count_invalid3"
.amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid3
@@ -260,8 +268,10 @@
.end_amdhsa_kernel
// GCN-LABEL: warning: test_amdhsa_shared_vgpr_count_invalid4
-// PREGFX10: error: directive requires gfx10+
-// GFX10PLUS: error: shared_vgpr_count*2 + compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot exceed 63
+// PREGFX10: error: directive requires gfx10 or gfx11
+// GFX10: error: shared_vgpr_count*2 + compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot exceed 63
+// GFX11: error: shared_vgpr_count*2 + compute_pgm_rsrc1.GRANULATED_WORKITEM_VGPR_COUNT cannot exceed 63
+// GFX12: error: directive requires gfx10 or gfx11
// NONAMDHSA: error: unknown directive
.warning "test_amdhsa_shared_vgpr_count_invalid4"
.amdhsa_kernel test_amdhsa_shared_vgpr_count_invalid4
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
index efbcec21f586..186d98f78b98 100644
--- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
@@ -118,7 +118,6 @@ disabled_user_sgpr:
.amdhsa_workgroup_processor_mode 1
.amdhsa_memory_ordered 1
.amdhsa_forward_progress 1
- .amdhsa_shared_vgpr_count 0
.amdhsa_round_robin_scheduling 1
.amdhsa_exception_fp_ieee_invalid_op 1
.amdhsa_exception_fp_denorm_src 1
@@ -157,7 +156,6 @@ disabled_user_sgpr:
// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
// ASM-NEXT: .amdhsa_memory_ordered 1
// ASM-NEXT: .amdhsa_forward_progress 1
-// ASM-NEXT: .amdhsa_shared_vgpr_count 0
// ASM-NEXT: .amdhsa_round_robin_scheduling 1
// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt
new file mode 100644
index 000000000000..d2476dbf876d
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
+
+[0x20,0xc1,0x3e,0xd5]
+# CHECK: mrs x0, VDISR_EL3
+
+[0x20,0xc1,0x1e,0xd5]
+# CHECK: msr VDISR_EL3, x0
+
+[0x60,0x52,0x3e,0xd5]
+# CHECK: mrs x0, VSESR_EL3
+
+[0x60,0x52,0x1e,0xd5]
+# CHECK: msr VSESR_EL3, x0
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt
new file mode 100644
index 000000000000..f7e355a700af
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt
@@ -0,0 +1,7 @@
+# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
+
+[0xa0,0x11,0x3e,0xd5]
+# CHECK: mrs x0, FGWTE3_EL3
+
+[0xa0,0x11,0x1e,0xd5]
+# CHECK: msr FGWTE3_EL3, x0
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt
new file mode 100644
index 000000000000..d9be7e5ba443
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt
@@ -0,0 +1,14 @@
+# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
+
+[0x80,0x23,0x3c,0xd5]
+# CHECK: mrs x0, HACDBSBR_EL2
+
+[0x80,0x23,0x1c,0xd5]
+# CHECK: msr HACDBSBR_EL2, x0
+
+[0xa0,0x23,0x3c,0xd5]
+# CHECK: mrs x0, HACDBSCONS_EL2
+
+[0xa0,0x23,0x1c,0xd5]
+# CHECK: msr HACDBSCONS_EL2, x0
+
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt
new file mode 100644
index 000000000000..999f322548f4
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt
@@ -0,0 +1,14 @@
+# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s
+
+[0x40,0x23,0x3c,0xd5]
+# CHECK: mrs x0, HDBSSBR_EL2
+
+[0x40,0x23,0x1c,0xd5]
+# CHECK: msr HDBSSBR_EL2, x0
+
+[0x60,0x23,0x3c,0xd5]
+# CHECK: mrs x0, HDBSSPROD_EL2
+
+[0x60,0x23,0x1c,0xd5]
+# CHECK: msr HDBSSPROD_EL2, x0
+
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt
new file mode 100644
index 000000000000..53d78023bc4b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt
@@ -0,0 +1,94 @@
+# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+pauth-lr < %s | FileCheck %s
+# RUN: not llvm-mc -triple aarch64 -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NO-PAUTHLR
+
+[0xfe,0xa3,0xc1,0xda]
+# CHECK: paciasppc
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0xa7,0xc1,0xda]
+# CHECK: pacibsppc
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0x83,0xc1,0xda]
+# CHECK: pacnbiasppc
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0x87,0xc1,0xda]
+# CHECK: pacnbibsppc
+# NO-PAUTHLR: invalid instruction encoding
+
+[0x9f,0x00,0x80,0xf3]
+# CHECK: autiasppc #-16
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xbf,0x00,0xa0,0xf3]
+# CHECK: autibsppc #-20
+# NO-PAUTHLR: invalid instruction encoding
+
+[0x1f,0x00,0xa0,0xf3]
+# CHECK: autibsppc #0
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xff,0xff,0xbf,0xf3]
+# CHECK: autibsppc #-262140
+# NO-PAUTHLR: invalid instruction encoding
+
+[0x1e,0x90,0xc1,0xda]
+# CHECK: autiasppc x0
+# NO-PAUTHLR: invalid instruction encoding
+
+[0x3e,0x94,0xc1,0xda]
+# CHECK: autibsppc x1
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0x93,0xc1,0xda]
+# CHECK: autiasppc xzr
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0x97,0xc1,0xda]
+# CHECK: autibsppc xzr
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0x8b,0xc1,0xda]
+# CHECK: pacia171615
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0x8f,0xc1,0xda]
+# CHECK: pacib171615
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0xbb,0xc1,0xda]
+# CHECK: autia171615
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xfe,0xbf,0xc1,0xda]
+# CHECK: autib171615
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xbf,0x01,0x00,0x55]
+# CHECK: retaasppc #-52
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xdf,0x01,0x20,0x55]
+# CHECK: retabsppc #-56
+# NO-PAUTHLR: invalid instruction encoding
+
+[0x1f,0x00,0x00,0x55]
+# CHECK: retaasppc #0
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xff,0xff,0x1f,0x55]
+# CHECK: retaasppc #-262140
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xe2,0x0b,0x5f,0xd6]
+# CHECK: retaasppc x2
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xe3,0x0f,0x5f,0xd6]
+# CHECK: retabsppc x3
+# NO-PAUTHLR: invalid instruction encoding
+
+[0xff,0x24,0x03,0xd5]
+# CHECK: pacm
+# NO-PAUTHLR: hint #39
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt
new file mode 100644
index 000000000000..df5e894a929e
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt
@@ -0,0 +1,27 @@
+# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+tlbiw -mattr=+xs < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-XS %s
+# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+tlbiw < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-NO-XS-TLBIW %s
+# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck --check-prefix=CHECK-NO-TLBIW --check-prefix=CHECK-NO-XS-TLBIW %s
+
+[0x5f,0x86,0x0c,0xd5]
+# CHECK-TLBIW: tlbi vmallws2e1
+# CHECK-NO-TLBIW: sys #4, c8, c6, #2
+
+[0x5f,0x82,0x0c,0xd5]
+# CHECK-TLBIW: tlbi vmallws2e1is
+# CHECK-NO-TLBIW: sys #4, c8, c2, #2
+
+[0x5f,0x85,0x0c,0xd5]
+# CHECK-TLBIW: tlbi vmallws2e1os
+# CHECK-NO-TLBIW: sys #4, c8, c5, #2
+
+[0x5f,0x96,0x0c,0xd5]
+# CHECK-XS: tlbi vmallws2e1nxs
+# CHECK-NO-XS-TLBIW: sys #4, c9, c6, #2
+
+[0x5f,0x92,0x0c,0xd5]
+# CHECK-XS: tlbi vmallws2e1isnxs
+# CHECK-NO-XS-TLBIW: sys #4, c9, c2, #2
+
+[0x5f,0x95,0x0c,0xd5]
+# CHECK-XS: tlbi vmallws2e1osnxs
+# CHECK-NO-XS-TLBIW: sys #4, c9, c5, #2
diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
index 24716b1226e4..e1bb7ad51171 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
@@ -34,6 +34,14 @@
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x02,0x18]
0x10,0x40,0x40,0xcc,0x00,0x11,0x02,0x18 # src2 sgpr0
+# This is ds_add_f32 with gds bit which is not valid on gfx12+.
+# GFX12: [[@LINE+1]]:1: warning: invalid instruction encoding
+0x00,0x00,0x56,0xd8,0x00,0x01,0x00,0x00
+
# this is image_msaa_load where samp field for gfx12 VSAMPLE is not all zeros
# GFX12: [[@LINE+1]]:1: warning: invalid instruction encoding
0x06,0x00,0x46,0xe4,0x01,0x10,0x80,0x00,0x05,0x06,0x07,0x00
+
+# This is ds_read_b32 with gds bit which is not valid on gfx90a.
+# GFX90A: [[@LINE+1]]:1: warning: invalid instruction encoding
+0x00,0x00,0x6d,0xd8,0x01,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
index d7f9daf29584..f4038cf10f50 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
@@ -837,6 +837,18 @@
# GFX12: global_atomic_xor_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+# GFX12: global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
# GFX12: global_load_addtid_b32 v1, off offset:64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt
index 8746ee79c8f5..b348e8b5ef01 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt
@@ -773,8 +773,3 @@
# GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00]
0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00
-
-# Disassembler still decodes the gds modifier even though the assembler does
-# not accept it.
-# GFX90A: ds_read_b32 v0, v1 gds ; encoding: [0x00,0x00,0x6d,0xd8,0x01,0x00,0x00,0x00]
-0x00,0x00,0x6d,0xd8,0x01,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/adc.txt b/llvm/test/MC/Disassembler/X86/apx/adc.txt
new file mode 100644
index 000000000000..b46957888719
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/adc.txt
@@ -0,0 +1,210 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} adcb $123, %bl
+# INTEL: {evex} adc bl, 123
+0x62,0xf4,0x7c,0x08,0x80,0xd3,0x7b
+
+# ATT: adcb $123, %bl, %cl
+# INTEL: adc cl, bl, 123
+0x62,0xf4,0x74,0x18,0x80,0xd3,0x7b
+
+# ATT: {evex} adcw $123, %dx
+# INTEL: {evex} adc dx, 123
+0x62,0xf4,0x7d,0x08,0x83,0xd2,0x7b
+
+# ATT: adcw $123, %dx, %ax
+# INTEL: adc ax, dx, 123
+0x62,0xf4,0x7d,0x18,0x83,0xd2,0x7b
+
+# ATT: {evex} adcl $123, %ecx
+# INTEL: {evex} adc ecx, 123
+0x62,0xf4,0x7c,0x08,0x83,0xd1,0x7b
+
+# ATT: adcl $123, %ecx, %edx
+# INTEL: adc edx, ecx, 123
+0x62,0xf4,0x6c,0x18,0x83,0xd1,0x7b
+
+# ATT: {evex} adcq $123, %r9
+# INTEL: {evex} adc r9, 123
+0x62,0xd4,0xfc,0x08,0x83,0xd1,0x7b
+
+# ATT: adcq $123, %r9, %r15
+# INTEL: adc r15, r9, 123
+0x62,0xd4,0x84,0x18,0x83,0xd1,0x7b
+
+# ATT: {evex} adcb $123, 291(%r8,%rax,4)
+# INTEL: {evex} adc byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x80,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: adcb $123, 291(%r8,%rax,4), %bl
+# INTEL: adc bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x18,0x80,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} adcw $123, 291(%r8,%rax,4)
+# INTEL: {evex} adc word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: adcw $123, 291(%r8,%rax,4), %dx
+# INTEL: adc dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} adcl $123, 291(%r8,%rax,4)
+# INTEL: {evex} adc dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: adcl $123, 291(%r8,%rax,4), %ecx
+# INTEL: adc ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} adcq $123, 291(%r8,%rax,4)
+# INTEL: {evex} adc qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: adcq $123, 291(%r8,%rax,4), %r9
+# INTEL: adc r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} adcw $1234, %dx
+# INTEL: {evex} adc dx, 1234
+0x62,0xf4,0x7d,0x08,0x81,0xd2,0xd2,0x04
+
+# ATT: adcw $1234, %dx, %ax
+# INTEL: adc ax, dx, 1234
+0x62,0xf4,0x7d,0x18,0x81,0xd2,0xd2,0x04
+
+# ATT: {evex} adcw $1234, 291(%r8,%rax,4)
+# INTEL: {evex} adc word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: adcw $1234, 291(%r8,%rax,4), %dx
+# INTEL: adc dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {evex} adcl $123456, %ecx
+# INTEL: {evex} adc ecx, 123456
+0x62,0xf4,0x7c,0x08,0x81,0xd1,0x40,0xe2,0x01,0x00
+
+# ATT: adcl $123456, %ecx, %edx
+# INTEL: adc edx, ecx, 123456
+0x62,0xf4,0x6c,0x18,0x81,0xd1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} adcq $123456, %r9
+# INTEL: {evex} adc r9, 123456
+0x62,0xd4,0xfc,0x08,0x81,0xd1,0x40,0xe2,0x01,0x00
+
+# ATT: adcq $123456, %r9, %r15
+# INTEL: adc r15, r9, 123456
+0x62,0xd4,0x84,0x18,0x81,0xd1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} adcl $123456, 291(%r8,%rax,4)
+# INTEL: {evex} adc dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: adcl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: adc ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} adcq $123456, 291(%r8,%rax,4)
+# INTEL: {evex} adc qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: adcq $123456, 291(%r8,%rax,4), %r9
+# INTEL: adc r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} adcb %bl, %cl
+# INTEL: {evex} adc cl, bl
+0x62,0xf4,0x7c,0x08,0x10,0xd9
+
+# ATT: adcb %bl, %cl, %r8b
+# INTEL: adc r8b, cl, bl
+0x62,0xf4,0x3c,0x18,0x10,0xd9
+
+# ATT: {evex} adcb %bl, 291(%r8,%rax,4)
+# INTEL: {evex} adc byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x08,0x10,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcb %bl, 291(%r8,%rax,4), %cl
+# INTEL: adc cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x18,0x10,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} adcw %dx, %ax
+# INTEL: {evex} adc ax, dx
+0x62,0xf4,0x7d,0x08,0x11,0xd0
+
+# ATT: adcw %dx, %ax, %r9w
+# INTEL: adc r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x11,0xd0
+
+# ATT: {evex} adcw %dx, 291(%r8,%rax,4)
+# INTEL: {evex} adc word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x08,0x11,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcw %dx, 291(%r8,%rax,4), %ax
+# INTEL: adc ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x18,0x11,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} adcl %ecx, %edx
+# INTEL: {evex} adc edx, ecx
+0x62,0xf4,0x7c,0x08,0x11,0xca
+
+# ATT: adcl %ecx, %edx, %r10d
+# INTEL: adc r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x11,0xca
+
+# ATT: {evex} adcl %ecx, 291(%r8,%rax,4)
+# INTEL: {evex} adc dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x08,0x11,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: adc edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x18,0x11,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} adcq %r9, %r15
+# INTEL: {evex} adc r15, r9
+0x62,0x54,0xfc,0x08,0x11,0xcf
+
+# ATT: adcq %r9, %r15, %r11
+# INTEL: adc r11, r15, r9
+0x62,0x54,0xa4,0x18,0x11,0xcf
+
+# ATT: {evex} adcq %r9, 291(%r8,%rax,4)
+# INTEL: {evex} adc qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x08,0x11,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcq %r9, 291(%r8,%rax,4), %r15
+# INTEL: adc r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x18,0x11,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} adcb 291(%r8,%rax,4), %bl
+# INTEL: {evex} adc bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x12,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcb 291(%r8,%rax,4), %bl, %cl
+# INTEL: adc cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0x12,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} adcw 291(%r8,%rax,4), %dx
+# INTEL: {evex} adc dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0x13,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcw 291(%r8,%rax,4), %dx, %ax
+# INTEL: adc ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x18,0x13,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} adcl 291(%r8,%rax,4), %ecx
+# INTEL: {evex} adc ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x13,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: adc edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x18,0x13,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} adcq 291(%r8,%rax,4), %r9
+# INTEL: {evex} adc r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x08,0x13,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: adcq 291(%r8,%rax,4), %r9, %r15
+# INTEL: adc r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x18,0x13,0x8c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/add.txt b/llvm/test/MC/Disassembler/X86/apx/add.txt
new file mode 100644
index 000000000000..4dcb523b0be7
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/add.txt
@@ -0,0 +1,418 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} addb $123, %bl
+# INTEL: {evex} add bl, 123
+0x62,0xf4,0x7c,0x08,0x80,0xc3,0x7b
+
+# ATT: {nf} addb $123, %bl
+# INTEL: {nf} add bl, 123
+0x62,0xf4,0x7c,0x0c,0x80,0xc3,0x7b
+
+# ATT: addb $123, %bl, %cl
+# INTEL: add cl, bl, 123
+0x62,0xf4,0x74,0x18,0x80,0xc3,0x7b
+
+# ATT: {nf} addb $123, %bl, %cl
+# INTEL: {nf} add cl, bl, 123
+0x62,0xf4,0x74,0x1c,0x80,0xc3,0x7b
+
+# ATT: {evex} addw $123, %dx
+# INTEL: {evex} add dx, 123
+0x62,0xf4,0x7d,0x08,0x83,0xc2,0x7b
+
+# ATT: {nf} addw $123, %dx
+# INTEL: {nf} add dx, 123
+0x62,0xf4,0x7d,0x0c,0x83,0xc2,0x7b
+
+# ATT: addw $123, %dx, %ax
+# INTEL: add ax, dx, 123
+0x62,0xf4,0x7d,0x18,0x83,0xc2,0x7b
+
+# ATT: {nf} addw $123, %dx, %ax
+# INTEL: {nf} add ax, dx, 123
+0x62,0xf4,0x7d,0x1c,0x83,0xc2,0x7b
+
+# ATT: {evex} addl $123, %ecx
+# INTEL: {evex} add ecx, 123
+0x62,0xf4,0x7c,0x08,0x83,0xc1,0x7b
+
+# ATT: {nf} addl $123, %ecx
+# INTEL: {nf} add ecx, 123
+0x62,0xf4,0x7c,0x0c,0x83,0xc1,0x7b
+
+# ATT: addl $123, %ecx, %edx
+# INTEL: add edx, ecx, 123
+0x62,0xf4,0x6c,0x18,0x83,0xc1,0x7b
+
+# ATT: {nf} addl $123, %ecx, %edx
+# INTEL: {nf} add edx, ecx, 123
+0x62,0xf4,0x6c,0x1c,0x83,0xc1,0x7b
+
+# ATT: {evex} addq $123, %r9
+# INTEL: {evex} add r9, 123
+0x62,0xd4,0xfc,0x08,0x83,0xc1,0x7b
+
+# ATT: {nf} addq $123, %r9
+# INTEL: {nf} add r9, 123
+0x62,0xd4,0xfc,0x0c,0x83,0xc1,0x7b
+
+# ATT: addq $123, %r9, %r15
+# INTEL: add r15, r9, 123
+0x62,0xd4,0x84,0x18,0x83,0xc1,0x7b
+
+# ATT: {nf} addq $123, %r9, %r15
+# INTEL: {nf} add r15, r9, 123
+0x62,0xd4,0x84,0x1c,0x83,0xc1,0x7b
+
+# ATT: {evex} addb $123, 291(%r8,%rax,4)
+# INTEL: {evex} add byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addb $123, 291(%r8,%rax,4)
+# INTEL: {nf} add byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: addb $123, 291(%r8,%rax,4), %bl
+# INTEL: add bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x18,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addb $123, 291(%r8,%rax,4), %bl
+# INTEL: {nf} add bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x1c,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} addw $123, 291(%r8,%rax,4)
+# INTEL: {evex} add word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addw $123, 291(%r8,%rax,4)
+# INTEL: {nf} add word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: addw $123, 291(%r8,%rax,4), %dx
+# INTEL: add dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addw $123, 291(%r8,%rax,4), %dx
+# INTEL: {nf} add dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} addl $123, 291(%r8,%rax,4)
+# INTEL: {evex} add dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addl $123, 291(%r8,%rax,4)
+# INTEL: {nf} add dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: addl $123, 291(%r8,%rax,4), %ecx
+# INTEL: add ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addl $123, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} add ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} addq $123, 291(%r8,%rax,4)
+# INTEL: {evex} add qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addq $123, 291(%r8,%rax,4)
+# INTEL: {nf} add qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: addq $123, 291(%r8,%rax,4), %r9
+# INTEL: add r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} addq $123, 291(%r8,%rax,4), %r9
+# INTEL: {nf} add r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} addw $1234, %dx
+# INTEL: {evex} add dx, 1234
+0x62,0xf4,0x7d,0x08,0x81,0xc2,0xd2,0x04
+
+# ATT: {nf} addw $1234, %dx
+# INTEL: {nf} add dx, 1234
+0x62,0xf4,0x7d,0x0c,0x81,0xc2,0xd2,0x04
+
+# ATT: addw $1234, %dx, %ax
+# INTEL: add ax, dx, 1234
+0x62,0xf4,0x7d,0x18,0x81,0xc2,0xd2,0x04
+
+# ATT: {nf} addw $1234, %dx, %ax
+# INTEL: {nf} add ax, dx, 1234
+0x62,0xf4,0x7d,0x1c,0x81,0xc2,0xd2,0x04
+
+# ATT: {evex} addw $1234, 291(%r8,%rax,4)
+# INTEL: {evex} add word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} addw $1234, 291(%r8,%rax,4)
+# INTEL: {nf} add word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: addw $1234, 291(%r8,%rax,4), %dx
+# INTEL: add dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} addw $1234, 291(%r8,%rax,4), %dx
+# INTEL: {nf} add dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {evex} addl $123456, %ecx
+# INTEL: {evex} add ecx, 123456
+0x62,0xf4,0x7c,0x08,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addl $123456, %ecx
+# INTEL: {nf} add ecx, 123456
+0x62,0xf4,0x7c,0x0c,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: addl $123456, %ecx, %edx
+# INTEL: add edx, ecx, 123456
+0x62,0xf4,0x6c,0x18,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addl $123456, %ecx, %edx
+# INTEL: {nf} add edx, ecx, 123456
+0x62,0xf4,0x6c,0x1c,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} addq $123456, %r9
+# INTEL: {evex} add r9, 123456
+0x62,0xd4,0xfc,0x08,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addq $123456, %r9
+# INTEL: {nf} add r9, 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: addq $123456, %r9, %r15
+# INTEL: add r15, r9, 123456
+0x62,0xd4,0x84,0x18,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addq $123456, %r9, %r15
+# INTEL: {nf} add r15, r9, 123456
+0x62,0xd4,0x84,0x1c,0x81,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} addl $123456, 291(%r8,%rax,4)
+# INTEL: {evex} add dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addl $123456, 291(%r8,%rax,4)
+# INTEL: {nf} add dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: addl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: add ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} add ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} addq $123456, 291(%r8,%rax,4)
+# INTEL: {evex} add qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addq $123456, 291(%r8,%rax,4)
+# INTEL: {nf} add qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: addq $123456, 291(%r8,%rax,4), %r9
+# INTEL: add r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} addq $123456, 291(%r8,%rax,4), %r9
+# INTEL: {nf} add r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} addb %bl, %cl
+# INTEL: {evex} add cl, bl
+0x62,0xf4,0x7c,0x08,0x00,0xd9
+
+# ATT: {nf} addb %bl, %cl
+# INTEL: {nf} add cl, bl
+0x62,0xf4,0x7c,0x0c,0x00,0xd9
+
+# ATT: addb %bl, %cl, %r8b
+# INTEL: add r8b, cl, bl
+0x62,0xf4,0x3c,0x18,0x00,0xd9
+
+# ATT: {nf} addb %bl, %cl, %r8b
+# INTEL: {nf} add r8b, cl, bl
+0x62,0xf4,0x3c,0x1c,0x00,0xd9
+
+# ATT: {evex} addb %bl, 291(%r8,%rax,4)
+# INTEL: {evex} add byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x08,0x00,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addb %bl, 291(%r8,%rax,4)
+# INTEL: {nf} add byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x0c,0x00,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addb %bl, 291(%r8,%rax,4), %cl
+# INTEL: add cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x18,0x00,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addb %bl, 291(%r8,%rax,4), %cl
+# INTEL: {nf} add cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x1c,0x00,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} addw %dx, %ax
+# INTEL: {evex} add ax, dx
+0x62,0xf4,0x7d,0x08,0x01,0xd0
+
+# ATT: {nf} addw %dx, %ax
+# INTEL: {nf} add ax, dx
+0x62,0xf4,0x7d,0x0c,0x01,0xd0
+
+# ATT: addw %dx, %ax, %r9w
+# INTEL: add r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x01,0xd0
+
+# ATT: {nf} addw %dx, %ax, %r9w
+# INTEL: {nf} add r9w, ax, dx
+0x62,0xf4,0x35,0x1c,0x01,0xd0
+
+# ATT: {evex} addw %dx, 291(%r8,%rax,4)
+# INTEL: {evex} add word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x08,0x01,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addw %dx, 291(%r8,%rax,4)
+# INTEL: {nf} add word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x0c,0x01,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addw %dx, 291(%r8,%rax,4), %ax
+# INTEL: add ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x18,0x01,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addw %dx, 291(%r8,%rax,4), %ax
+# INTEL: {nf} add ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x1c,0x01,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} addl %ecx, %edx
+# INTEL: {evex} add edx, ecx
+0x62,0xf4,0x7c,0x08,0x01,0xca
+
+# ATT: {nf} addl %ecx, %edx
+# INTEL: {nf} add edx, ecx
+0x62,0xf4,0x7c,0x0c,0x01,0xca
+
+# ATT: addl %ecx, %edx, %r10d
+# INTEL: add r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x01,0xca
+
+# ATT: {nf} addl %ecx, %edx, %r10d
+# INTEL: {nf} add r10d, edx, ecx
+0x62,0xf4,0x2c,0x1c,0x01,0xca
+
+# ATT: {evex} addl %ecx, 291(%r8,%rax,4)
+# INTEL: {evex} add dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x08,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addl %ecx, 291(%r8,%rax,4)
+# INTEL: {nf} add dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x0c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: add edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x18,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: {nf} add edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x1c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} addq %r9, %r15
+# INTEL: {evex} add r15, r9
+0x62,0x54,0xfc,0x08,0x01,0xcf
+
+# ATT: {nf} addq %r9, %r15
+# INTEL: {nf} add r15, r9
+0x62,0x54,0xfc,0x0c,0x01,0xcf
+
+# ATT: addq %r9, %r15, %r11
+# INTEL: add r11, r15, r9
+0x62,0x54,0xa4,0x18,0x01,0xcf
+
+# ATT: {nf} addq %r9, %r15, %r11
+# INTEL: {nf} add r11, r15, r9
+0x62,0x54,0xa4,0x1c,0x01,0xcf
+
+# ATT: {evex} addq %r9, 291(%r8,%rax,4)
+# INTEL: {evex} add qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x08,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addq %r9, 291(%r8,%rax,4)
+# INTEL: {nf} add qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x0c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addq %r9, 291(%r8,%rax,4), %r15
+# INTEL: add r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x18,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addq %r9, 291(%r8,%rax,4), %r15
+# INTEL: {nf} add r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x1c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} addb 291(%r8,%rax,4), %bl
+# INTEL: {evex} add bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x02,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addb 291(%r8,%rax,4), %bl
+# INTEL: {nf} add bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x02,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addb 291(%r8,%rax,4), %bl, %cl
+# INTEL: add cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0x02,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addb 291(%r8,%rax,4), %bl, %cl
+# INTEL: {nf} add cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x1c,0x02,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} addw 291(%r8,%rax,4), %dx
+# INTEL: {evex} add dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0x03,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addw 291(%r8,%rax,4), %dx
+# INTEL: {nf} add dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x0c,0x03,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addw 291(%r8,%rax,4), %dx, %ax
+# INTEL: add ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x18,0x03,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addw 291(%r8,%rax,4), %dx, %ax
+# INTEL: {nf} add ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x1c,0x03,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} addl 291(%r8,%rax,4), %ecx
+# INTEL: {evex} add ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addl 291(%r8,%rax,4), %ecx
+# INTEL: {nf} add ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: add edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x18,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: {nf} add edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x1c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} addq 291(%r8,%rax,4), %r9
+# INTEL: {evex} add r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x08,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addq 291(%r8,%rax,4), %r9
+# INTEL: {nf} add r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x0c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: addq 291(%r8,%rax,4), %r9, %r15
+# INTEL: add r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x18,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} addq 291(%r8,%rax,4), %r9, %r15
+# INTEL: {nf} add r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x1c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt b/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt
new file mode 100644
index 000000000000..960c40cfc4b1
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt
@@ -0,0 +1,22 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: ldtilecfg 291(%r28,%r29,4)
+# INTEL: ldtilecfg [r28 + 4*r29 + 291]
+0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00
+
+# ATT: sttilecfg 291(%r28,%r29,4)
+# INTEL: sttilecfg [r28 + 4*r29 + 291]
+0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00
+
+# ATT: tileloadd 291(%r28,%r29,4), %tmm6
+# INTEL: tileloadd tmm6, [r28 + 4*r29 + 291]
+0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: tileloaddt1 291(%r28,%r29,4), %tmm6
+# INTEL: tileloaddt1 tmm6, [r28 + 4*r29 + 291]
+0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: tilestored %tmm6, 291(%r28,%r29,4)
+# INTEL: tilestored [r28 + 4*r29 + 291], tmm6
+0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/and.txt b/llvm/test/MC/Disassembler/X86/apx/and.txt
new file mode 100644
index 000000000000..39697516a8e6
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/and.txt
@@ -0,0 +1,418 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} andb $123, %bl
+# INTEL: {evex} and bl, 123
+0x62,0xf4,0x7c,0x08,0x80,0xe3,0x7b
+
+# ATT: {nf} andb $123, %bl
+# INTEL: {nf} and bl, 123
+0x62,0xf4,0x7c,0x0c,0x80,0xe3,0x7b
+
+# ATT: andb $123, %bl, %cl
+# INTEL: and cl, bl, 123
+0x62,0xf4,0x74,0x18,0x80,0xe3,0x7b
+
+# ATT: {nf} andb $123, %bl, %cl
+# INTEL: {nf} and cl, bl, 123
+0x62,0xf4,0x74,0x1c,0x80,0xe3,0x7b
+
+# ATT: {evex} andw $123, %dx
+# INTEL: {evex} and dx, 123
+0x62,0xf4,0x7d,0x08,0x83,0xe2,0x7b
+
+# ATT: {nf} andw $123, %dx
+# INTEL: {nf} and dx, 123
+0x62,0xf4,0x7d,0x0c,0x83,0xe2,0x7b
+
+# ATT: andw $123, %dx, %ax
+# INTEL: and ax, dx, 123
+0x62,0xf4,0x7d,0x18,0x83,0xe2,0x7b
+
+# ATT: {nf} andw $123, %dx, %ax
+# INTEL: {nf} and ax, dx, 123
+0x62,0xf4,0x7d,0x1c,0x83,0xe2,0x7b
+
+# ATT: {evex} andl $123, %ecx
+# INTEL: {evex} and ecx, 123
+0x62,0xf4,0x7c,0x08,0x83,0xe1,0x7b
+
+# ATT: {nf} andl $123, %ecx
+# INTEL: {nf} and ecx, 123
+0x62,0xf4,0x7c,0x0c,0x83,0xe1,0x7b
+
+# ATT: andl $123, %ecx, %edx
+# INTEL: and edx, ecx, 123
+0x62,0xf4,0x6c,0x18,0x83,0xe1,0x7b
+
+# ATT: {nf} andl $123, %ecx, %edx
+# INTEL: {nf} and edx, ecx, 123
+0x62,0xf4,0x6c,0x1c,0x83,0xe1,0x7b
+
+# ATT: {evex} andq $123, %r9
+# INTEL: {evex} and r9, 123
+0x62,0xd4,0xfc,0x08,0x83,0xe1,0x7b
+
+# ATT: {nf} andq $123, %r9
+# INTEL: {nf} and r9, 123
+0x62,0xd4,0xfc,0x0c,0x83,0xe1,0x7b
+
+# ATT: andq $123, %r9, %r15
+# INTEL: and r15, r9, 123
+0x62,0xd4,0x84,0x18,0x83,0xe1,0x7b
+
+# ATT: {nf} andq $123, %r9, %r15
+# INTEL: {nf} and r15, r9, 123
+0x62,0xd4,0x84,0x1c,0x83,0xe1,0x7b
+
+# ATT: {evex} andb $123, 291(%r8,%rax,4)
+# INTEL: {evex} and byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andb $123, 291(%r8,%rax,4)
+# INTEL: {nf} and byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: andb $123, 291(%r8,%rax,4), %bl
+# INTEL: and bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x18,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andb $123, 291(%r8,%rax,4), %bl
+# INTEL: {nf} and bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x1c,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} andw $123, 291(%r8,%rax,4)
+# INTEL: {evex} and word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andw $123, 291(%r8,%rax,4)
+# INTEL: {nf} and word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: andw $123, 291(%r8,%rax,4), %dx
+# INTEL: and dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andw $123, 291(%r8,%rax,4), %dx
+# INTEL: {nf} and dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} andl $123, 291(%r8,%rax,4)
+# INTEL: {evex} and dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andl $123, 291(%r8,%rax,4)
+# INTEL: {nf} and dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: andl $123, 291(%r8,%rax,4), %ecx
+# INTEL: and ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andl $123, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} and ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} andq $123, 291(%r8,%rax,4)
+# INTEL: {evex} and qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andq $123, 291(%r8,%rax,4)
+# INTEL: {nf} and qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: andq $123, 291(%r8,%rax,4), %r9
+# INTEL: and r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} andq $123, 291(%r8,%rax,4), %r9
+# INTEL: {nf} and r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} andw $1234, %dx
+# INTEL: {evex} and dx, 1234
+0x62,0xf4,0x7d,0x08,0x81,0xe2,0xd2,0x04
+
+# ATT: {nf} andw $1234, %dx
+# INTEL: {nf} and dx, 1234
+0x62,0xf4,0x7d,0x0c,0x81,0xe2,0xd2,0x04
+
+# ATT: andw $1234, %dx, %ax
+# INTEL: and ax, dx, 1234
+0x62,0xf4,0x7d,0x18,0x81,0xe2,0xd2,0x04
+
+# ATT: {nf} andw $1234, %dx, %ax
+# INTEL: {nf} and ax, dx, 1234
+0x62,0xf4,0x7d,0x1c,0x81,0xe2,0xd2,0x04
+
+# ATT: {evex} andw $1234, 291(%r8,%rax,4)
+# INTEL: {evex} and word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} andw $1234, 291(%r8,%rax,4)
+# INTEL: {nf} and word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: andw $1234, 291(%r8,%rax,4), %dx
+# INTEL: and dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} andw $1234, 291(%r8,%rax,4), %dx
+# INTEL: {nf} and dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {evex} andl $123456, %ecx
+# INTEL: {evex} and ecx, 123456
+0x62,0xf4,0x7c,0x08,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andl $123456, %ecx
+# INTEL: {nf} and ecx, 123456
+0x62,0xf4,0x7c,0x0c,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: andl $123456, %ecx, %edx
+# INTEL: and edx, ecx, 123456
+0x62,0xf4,0x6c,0x18,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andl $123456, %ecx, %edx
+# INTEL: {nf} and edx, ecx, 123456
+0x62,0xf4,0x6c,0x1c,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} andq $123456, %r9
+# INTEL: {evex} and r9, 123456
+0x62,0xd4,0xfc,0x08,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andq $123456, %r9
+# INTEL: {nf} and r9, 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: andq $123456, %r9, %r15
+# INTEL: and r15, r9, 123456
+0x62,0xd4,0x84,0x18,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andq $123456, %r9, %r15
+# INTEL: {nf} and r15, r9, 123456
+0x62,0xd4,0x84,0x1c,0x81,0xe1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} andl $123456, 291(%r8,%rax,4)
+# INTEL: {evex} and dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andl $123456, 291(%r8,%rax,4)
+# INTEL: {nf} and dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: andl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: and ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} and ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} andq $123456, 291(%r8,%rax,4)
+# INTEL: {evex} and qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andq $123456, 291(%r8,%rax,4)
+# INTEL: {nf} and qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: andq $123456, 291(%r8,%rax,4), %r9
+# INTEL: and r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} andq $123456, 291(%r8,%rax,4), %r9
+# INTEL: {nf} and r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} andb %bl, %cl
+# INTEL: {evex} and cl, bl
+0x62,0xf4,0x7c,0x08,0x20,0xd9
+
+# ATT: {nf} andb %bl, %cl
+# INTEL: {nf} and cl, bl
+0x62,0xf4,0x7c,0x0c,0x20,0xd9
+
+# ATT: andb %bl, %cl, %r8b
+# INTEL: and r8b, cl, bl
+0x62,0xf4,0x3c,0x18,0x20,0xd9
+
+# ATT: {nf} andb %bl, %cl, %r8b
+# INTEL: {nf} and r8b, cl, bl
+0x62,0xf4,0x3c,0x1c,0x20,0xd9
+
+# ATT: {evex} andb %bl, 291(%r8,%rax,4)
+# INTEL: {evex} and byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x08,0x20,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andb %bl, 291(%r8,%rax,4)
+# INTEL: {nf} and byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x0c,0x20,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andb %bl, 291(%r8,%rax,4), %cl
+# INTEL: and cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x18,0x20,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andb %bl, 291(%r8,%rax,4), %cl
+# INTEL: {nf} and cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x1c,0x20,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} andw %dx, %ax
+# INTEL: {evex} and ax, dx
+0x62,0xf4,0x7d,0x08,0x21,0xd0
+
+# ATT: {nf} andw %dx, %ax
+# INTEL: {nf} and ax, dx
+0x62,0xf4,0x7d,0x0c,0x21,0xd0
+
+# ATT: andw %dx, %ax, %r9w
+# INTEL: and r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x21,0xd0
+
+# ATT: {nf} andw %dx, %ax, %r9w
+# INTEL: {nf} and r9w, ax, dx
+0x62,0xf4,0x35,0x1c,0x21,0xd0
+
+# ATT: {evex} andw %dx, 291(%r8,%rax,4)
+# INTEL: {evex} and word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x08,0x21,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andw %dx, 291(%r8,%rax,4)
+# INTEL: {nf} and word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x0c,0x21,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andw %dx, 291(%r8,%rax,4), %ax
+# INTEL: and ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x18,0x21,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andw %dx, 291(%r8,%rax,4), %ax
+# INTEL: {nf} and ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x1c,0x21,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} andl %ecx, %edx
+# INTEL: {evex} and edx, ecx
+0x62,0xf4,0x7c,0x08,0x21,0xca
+
+# ATT: {nf} andl %ecx, %edx
+# INTEL: {nf} and edx, ecx
+0x62,0xf4,0x7c,0x0c,0x21,0xca
+
+# ATT: andl %ecx, %edx, %r10d
+# INTEL: and r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x21,0xca
+
+# ATT: {nf} andl %ecx, %edx, %r10d
+# INTEL: {nf} and r10d, edx, ecx
+0x62,0xf4,0x2c,0x1c,0x21,0xca
+
+# ATT: {evex} andl %ecx, 291(%r8,%rax,4)
+# INTEL: {evex} and dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x08,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andl %ecx, 291(%r8,%rax,4)
+# INTEL: {nf} and dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x0c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: and edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x18,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: {nf} and edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x1c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} andq %r9, %r15
+# INTEL: {evex} and r15, r9
+0x62,0x54,0xfc,0x08,0x21,0xcf
+
+# ATT: {nf} andq %r9, %r15
+# INTEL: {nf} and r15, r9
+0x62,0x54,0xfc,0x0c,0x21,0xcf
+
+# ATT: andq %r9, %r15, %r11
+# INTEL: and r11, r15, r9
+0x62,0x54,0xa4,0x18,0x21,0xcf
+
+# ATT: {nf} andq %r9, %r15, %r11
+# INTEL: {nf} and r11, r15, r9
+0x62,0x54,0xa4,0x1c,0x21,0xcf
+
+# ATT: {evex} andq %r9, 291(%r8,%rax,4)
+# INTEL: {evex} and qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x08,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andq %r9, 291(%r8,%rax,4)
+# INTEL: {nf} and qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x0c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andq %r9, 291(%r8,%rax,4), %r15
+# INTEL: and r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x18,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andq %r9, 291(%r8,%rax,4), %r15
+# INTEL: {nf} and r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x1c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} andb 291(%r8,%rax,4), %bl
+# INTEL: {evex} and bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x22,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andb 291(%r8,%rax,4), %bl
+# INTEL: {nf} and bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x22,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andb 291(%r8,%rax,4), %bl, %cl
+# INTEL: and cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0x22,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andb 291(%r8,%rax,4), %bl, %cl
+# INTEL: {nf} and cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x1c,0x22,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} andw 291(%r8,%rax,4), %dx
+# INTEL: {evex} and dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0x23,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andw 291(%r8,%rax,4), %dx
+# INTEL: {nf} and dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x0c,0x23,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andw 291(%r8,%rax,4), %dx, %ax
+# INTEL: and ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x18,0x23,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andw 291(%r8,%rax,4), %dx, %ax
+# INTEL: {nf} and ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x1c,0x23,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} andl 291(%r8,%rax,4), %ecx
+# INTEL: {evex} and ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andl 291(%r8,%rax,4), %ecx
+# INTEL: {nf} and ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: and edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x18,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: {nf} and edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x1c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} andq 291(%r8,%rax,4), %r9
+# INTEL: {evex} and r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x08,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andq 291(%r8,%rax,4), %r9
+# INTEL: {nf} and r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x0c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: andq 291(%r8,%rax,4), %r9, %r15
+# INTEL: and r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x18,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} andq 291(%r8,%rax,4), %r9, %r15
+# INTEL: {nf} and r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x1c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt
new file mode 100644
index 000000000000..9f65d4c8d25c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt
@@ -0,0 +1,122 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpbexadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpbxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpbxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmplexadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmplexadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmplxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmplxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpaxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpgxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpgexadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnoxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnpxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnsxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpnexadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpoxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpoxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmppxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmppxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmppxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpsxadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpsxadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpexadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00
+
+# ATT: cmpexadd %r19, %r23, 291(%r28,%r29,4)
+# INTEL: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/crc32.txt b/llvm/test/MC/Disassembler/X86/apx/crc32.txt
new file mode 100644
index 000000000000..0144f576b8ba
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/crc32.txt
@@ -0,0 +1,90 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: crc32b %al, %ebx
+# INTEL: crc32 ebx, al
+0x62,0xf4,0x7c,0x08,0xf0,0xd8
+
+# ATT: crc32b %al, %rbx
+# INTEL: crc32 rbx, al
+0x62,0xf4,0xfc,0x08,0xf0,0xd8
+
+# ATT: crc32w %ax, %ebx
+# INTEL: crc32 ebx, ax
+0x62,0xf4,0x7d,0x08,0xf1,0xd8
+
+# ATT: crc32l %eax, %ebx
+# INTEL: crc32 ebx, eax
+0x62,0xf4,0x7c,0x08,0xf1,0xd8
+
+# ATT: crc32q %rax, %rbx
+# INTEL: crc32 rbx, rax
+0x62,0xf4,0xfc,0x08,0xf1,0xd8
+
+# ATT: crc32w 291(%rax,%rbx,4), %ecx
+# INTEL: crc32 ecx, word ptr [rax + 4*rbx + 291]
+0x62,0xf4,0x7d,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00
+
+# ATT: crc32l 291(%rax,%rbx,4), %ecx
+# INTEL: crc32 ecx, dword ptr [rax + 4*rbx + 291]
+0x62,0xf4,0x7c,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00
+
+# ATT: crc32b 291(%rax,%rbx,4), %rcx
+# INTEL: crc32 rcx, byte ptr [rax + 4*rbx + 291]
+0x62,0xf4,0xfc,0x08,0xf0,0x8c,0x98,0x23,0x01,0x00,0x00
+
+# ATT: crc32q 291(%rax,%rbx,4), %rcx
+# INTEL: crc32 rcx, qword ptr [rax + 4*rbx + 291]
+0x62,0xf4,0xfc,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00
+
+# ATT: crc32b %r16b, %r22d
+# INTEL: crc32 r22d, r16b
+0x62,0xec,0x7c,0x08,0xf0,0xf0
+
+# ATT: crc32b %r16b, %r23
+# INTEL: crc32 r23, r16b
+0x62,0xec,0xfc,0x08,0xf0,0xf8
+
+# ATT: crc32w %r17w, %r22d
+# INTEL: crc32 r22d, r17w
+0x62,0xec,0x7d,0x08,0xf1,0xf1
+
+# ATT: crc32l %r18d, %r22d
+# INTEL: crc32 r22d, r18d
+0x62,0xec,0x7c,0x08,0xf1,0xf2
+
+# ATT: crc32q %r19, %r23
+# INTEL: crc32 r23, r19
+0x62,0xec,0xfc,0x08,0xf1,0xfb
+
+# ATT: crc32w 291(%r28,%r29,4), %r18d
+# INTEL: crc32 r18d, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0xf1,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT: crc32l 291(%r28,%r29,4), %r18d
+# INTEL: crc32 r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0xf1,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT: crc32b 291(%r28,%r29,4), %r19
+# INTEL: crc32 r19, byte ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT: crc32q 291(%r28,%r29,4), %r19
+# INTEL: crc32 r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT: crc32w 123(%r28,%r29,4), %r18d
+# INTEL: crc32 r18d, word ptr [r28 + 4*r29 + 123]
+0x62,0x8c,0x79,0x08,0xf1,0x54,0xac,0x7b
+
+# ATT: crc32l 123(%r28,%r29,4), %r18d
+# INTEL: crc32 r18d, dword ptr [r28 + 4*r29 + 123]
+0x62,0x8c,0x78,0x08,0xf1,0x54,0xac,0x7b
+
+# ATT: crc32b 123(%r28,%r29,4), %r19
+# INTEL: crc32 r19, byte ptr [r28 + 4*r29 + 123]
+0x62,0x8c,0xf8,0x08,0xf0,0x5c,0xac,0x7b
+
+# ATT: crc32q 123(%r28,%r29,4), %r19
+# INTEL: crc32 r19, qword ptr [r28 + 4*r29 + 123]
+0x62,0x8c,0xf8,0x08,0xf1,0x5c,0xac,0x7b
diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
index 389b22cb4a22..88258e7e29b5 100644
--- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
@@ -8,60 +8,102 @@
# INTEL: vextractf32x4 xmmword ptr [r16 + r17], zmm0, 1
0x62,0xfb,0x79,0x48,0x19,0x04,0x08,0x01
+# ATT: addq %r16, 123(%r17), %r18
+# INTEL: add r18, qword ptr [r17 + 123], r16
+0x62,0xec,0xec,0x10,0x01,0x41,0x7b
+
## MRMSrcMem
# ATT: vbroadcasti32x4 (%r16,%r17), %zmm0
# INTEL: vbroadcasti32x4 zmm0, xmmword ptr [r16 + r17]
0x62,0xfa,0x79,0x48,0x5a,0x04,0x08
+# ATT: subq 123(%r16), %r17, %r18
+# INTEL: sub r18, r17, qword ptr [r16 + 123]
+0x62,0xec,0xec,0x10,0x2b,0x48,0x7b
+
## MRM0m
# ATT: vprorq $0, (%r16,%r17), %zmm0
# INTEL: vprorq zmm0, zmmword ptr [r16 + r17], 0
0x62,0xf9,0xf9,0x48,0x72,0x04,0x08,0x00
+# ATT: addq $127, 123(%r16), %r17
+# INTEL: add r17, qword ptr [r16 + 123], 127
+0x62,0xfc,0xf4,0x10,0x83,0x40,0x7b,0x7f
+
## MRM1m
# ATT: vprolq $0, (%r16,%r17), %zmm0
# INTEL: vprolq zmm0, zmmword ptr [r16 + r17], 0
0x62,0xf9,0xf9,0x48,0x72,0x0c,0x08,0x00
+# ATT: orq $127, 123(%r16), %r17
+# INTEL: or r17, qword ptr [r16 + 123], 127
+0x62,0xfc,0xf4,0x10,0x83,0x48,0x7b,0x7f
+
## MRM2m
# ATT: vpsrlq $0, (%r16,%r17), %zmm0
# INTEL: vpsrlq zmm0, zmmword ptr [r16 + r17], 0
0x62,0xf9,0xf9,0x48,0x73,0x14,0x08,0x00
+# ATT: adcq $127, 123(%r16), %r17
+# INTEL: adc r17, qword ptr [r16 + 123], 127
+0x62,0xfc,0xf4,0x10,0x83,0x50,0x7b,0x7f
+
## MRM3m
# ATT: vpsrldq $0, (%r16,%r17), %zmm0
# INTEL: vpsrldq zmm0, zmmword ptr [r16 + r17], 0
0x62,0xf9,0x79,0x48,0x73,0x1c,0x08,0x00
+# ATT: sbbq $127, 123(%r16), %r17
+# INTEL: sbb r17, qword ptr [r16 + 123], 127
+0x62,0xfc,0xf4,0x10,0x83,0x58,0x7b,0x7f
+
## MRM4m
# ATT: vpsraq $0, (%r16,%r17), %zmm0
# INTEL: vpsraq zmm0, zmmword ptr [r16 + r17], 0
0x62,0xf9,0xf9,0x48,0x72,0x24,0x08,0x00
+# ATT: andq $127, 123(%r16), %r17
+# INTEL: and r17, qword ptr [r16 + 123], 127
+0x62,0xfc,0xf4,0x10,0x83,0x60,0x7b,0x7f
+
## MRM5m
# ATT: vscatterpf0dps (%r16,%zmm0) {%k1}
# INTEL: vscatterpf0dps {k1}, zmmword ptr [r16 + zmm0]
0x62,0xfa,0x7d,0x49,0xc6,0x2c,0x00
+# ATT: subq $127, 123(%r16), %r17
+# INTEL: sub r17, qword ptr [r16 + 123], 127
+0x62,0xfc,0xf4,0x10,0x83,0x68,0x7b,0x7f
+
## MRM6m
# ATT: vpsllq $0, (%r16,%r17), %zmm0
# INTEL: vpsllq zmm0, zmmword ptr [r16 + r17], 0
0x62,0xf9,0xf9,0x48,0x73,0x34,0x08,0x00
+# ATT: xorq $127, 123(%r16), %r17
+# INTEL: xor r17, qword ptr [r16 + 123], 127
+0x62,0xfc,0xf4,0x10,0x83,0x70,0x7b,0x7f
+
## MRM7m
# ATT: vpslldq $0, (%r16,%r17), %zmm0
# INTEL: vpslldq zmm0, zmmword ptr [r16 + r17], 0
0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00
+## MRMDestMem4VOp3CC
+
+# ATT: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4)
+# INTEL: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00
+
## MRMSrcMem4VOp3
# ATT: bzhiq %r19, 291(%r28,%r29,4), %r23
@@ -74,8 +116,70 @@
# INTEL: vextractps r16d, xmm16, 1
0x62,0xeb,0x7d,0x08,0x17,0xc0,0x01
+# ATT: {nf} addq %r16, %r17
+# INTEL: {nf} add r17, r16
+0x62,0xec,0xfc,0x0c,0x01,0xc1
+
+## MRMSrcReg
+
+# ATT: mulxq %r16, %r17, %r18
+# INTEL: mulx r18, r17, r16
+0x62,0xea,0xf7,0x00,0xf6,0xd0
+
## MRMSrcReg4VOp3
# ATT: bzhiq %r19, %r23, %r27
# INTEL: bzhi r27, r23, r19
0x62,0x6a,0xe4,0x00,0xf5,0xdf
+
+## MRM0r
+
+# ATT: addq $127, %r16, %r17
+# INTEL: add r17, r16, 127
+0x62,0xfc,0xf4,0x10,0x83,0xc0,0x7f
+
+## MRM1r
+
+# ATT: orq $127, %r16, %r17
+# INTEL: or r17, r16, 127
+0x62,0xfc,0xf4,0x10,0x83,0xc8,0x7f
+
+## MRM2r
+
+# ATT: adcq $127, %r16, %r17
+# INTEL: adc r17, r16, 127
+0x62,0xfc,0xf4,0x10,0x83,0xd0,0x7f
+
+## MRM3r
+
+# ATT: sbbq $127, %r16, %r17
+# INTEL: sbb r17, r16, 127
+0x62,0xfc,0xf4,0x10,0x83,0xd8,0x7f
+
+## MRM4r
+
+# ATT: andq $127, %r16, %r17
+# INTEL: and r17, r16, 127
+0x62,0xfc,0xf4,0x10,0x83,0xe0,0x7f
+
+## MRM5r
+
+# ATT: subq $127, %r16, %r17
+# INTEL: sub r17, r16, 127
+0x62,0xfc,0xf4,0x10,0x83,0xe8,0x7f
+
+## MRM6r
+
+# ATT: xorq $127, %r16, %r17
+# INTEL: xor r17, r16, 127
+0x62,0xfc,0xf4,0x10,0x83,0xf0,0x7f
+
+## NoCD8
+
+# ATT: {nf} negq 123(%r16)
+# INTEL: {nf} neg qword ptr [r16 + 123]
+0x62,0xfc,0xfc,0x0c,0xf7,0x58,0x7b
+
+# ATT: {evex} notq 123(%r16)
+# INTEL: {evex} not qword ptr [r16 + 123]
+0x62,0xfc,0xfc,0x08,0xf7,0x50,0x7b
diff --git a/llvm/test/MC/Disassembler/X86/apx/neg.txt b/llvm/test/MC/Disassembler/X86/apx/neg.txt
new file mode 100644
index 000000000000..91cffb3b4b13
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/neg.txt
@@ -0,0 +1,130 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} negb %bl
+# INTEL: {evex} neg bl
+0x62,0xf4,0x7c,0x08,0xf6,0xdb
+
+# ATT: {nf} negb %bl
+# INTEL: {nf} neg bl
+0x62,0xf4,0x7c,0x0c,0xf6,0xdb
+
+# ATT: negb %bl, %bl
+# INTEL: neg bl, bl
+0x62,0xf4,0x64,0x18,0xf6,0xdb
+
+# ATT: {nf} negb %bl, %bl
+# INTEL: {nf} neg bl, bl
+0x62,0xf4,0x64,0x1c,0xf6,0xdb
+
+# ATT: {evex} negw %dx
+# INTEL: {evex} neg dx
+0x62,0xf4,0x7d,0x08,0xf7,0xda
+
+# ATT: {nf} negw %dx
+# INTEL: {nf} neg dx
+0x62,0xf4,0x7d,0x0c,0xf7,0xda
+
+# ATT: negw %dx, %dx
+# INTEL: neg dx, dx
+0x62,0xf4,0x6d,0x18,0xf7,0xda
+
+# ATT: {nf} negw %dx, %dx
+# INTEL: {nf} neg dx, dx
+0x62,0xf4,0x6d,0x1c,0xf7,0xda
+
+# ATT: {evex} negl %ecx
+# INTEL: {evex} neg ecx
+0x62,0xf4,0x7c,0x08,0xf7,0xd9
+
+# ATT: {nf} negl %ecx
+# INTEL: {nf} neg ecx
+0x62,0xf4,0x7c,0x0c,0xf7,0xd9
+
+# ATT: negl %ecx, %ecx
+# INTEL: neg ecx, ecx
+0x62,0xf4,0x74,0x18,0xf7,0xd9
+
+# ATT: {nf} negl %ecx, %ecx
+# INTEL: {nf} neg ecx, ecx
+0x62,0xf4,0x74,0x1c,0xf7,0xd9
+
+# ATT: {evex} negq %r9
+# INTEL: {evex} neg r9
+0x62,0xd4,0xfc,0x08,0xf7,0xd9
+
+# ATT: {nf} negq %r9
+# INTEL: {nf} neg r9
+0x62,0xd4,0xfc,0x0c,0xf7,0xd9
+
+# ATT: negq %r9, %r9
+# INTEL: neg r9, r9
+0x62,0xd4,0xb4,0x18,0xf7,0xd9
+
+# ATT: {nf} negq %r9, %r9
+# INTEL: {nf} neg r9, r9
+0x62,0xd4,0xb4,0x1c,0xf7,0xd9
+
+# ATT: {evex} negb 291(%r8,%rax,4)
+# INTEL: {evex} neg byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negb 291(%r8,%rax,4)
+# INTEL: {nf} neg byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: negb 291(%r8,%rax,4), %bl
+# INTEL: neg bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x64,0x18,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negb 291(%r8,%rax,4), %bl
+# INTEL: {nf} neg bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x64,0x1c,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} negw 291(%r8,%rax,4)
+# INTEL: {evex} neg word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negw 291(%r8,%rax,4)
+# INTEL: {nf} neg word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: negw 291(%r8,%rax,4), %dx
+# INTEL: neg dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6d,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negw 291(%r8,%rax,4), %dx
+# INTEL: {nf} neg dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6d,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} negl 291(%r8,%rax,4)
+# INTEL: {evex} neg dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negl 291(%r8,%rax,4)
+# INTEL: {nf} neg dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: negl 291(%r8,%rax,4), %ecx
+# INTEL: neg ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negl 291(%r8,%rax,4), %ecx
+# INTEL: {nf} neg ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} negq 291(%r8,%rax,4)
+# INTEL: {evex} neg qword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0xfc,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negq 291(%r8,%rax,4)
+# INTEL: {nf} neg qword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0xfc,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: negq 291(%r8,%rax,4), %r9
+# INTEL: neg r9, qword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0xb4,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} negq 291(%r8,%rax,4), %r9
+# INTEL: {nf} neg r9, qword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0xb4,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/not.txt b/llvm/test/MC/Disassembler/X86/apx/not.txt
new file mode 100644
index 000000000000..e2c4d57fc69d
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/not.txt
@@ -0,0 +1,66 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} notb %bl
+# INTEL: {evex} not bl
+0x62,0xf4,0x7c,0x08,0xf6,0xd3
+
+# ATT: notb %bl, %bl
+# INTEL: not bl, bl
+0x62,0xf4,0x64,0x18,0xf6,0xd3
+
+# ATT: {evex} notw %dx
+# INTEL: {evex} not dx
+0x62,0xf4,0x7d,0x08,0xf7,0xd2
+
+# ATT: notw %dx, %dx
+# INTEL: not dx, dx
+0x62,0xf4,0x6d,0x18,0xf7,0xd2
+
+# ATT: {evex} notl %ecx
+# INTEL: {evex} not ecx
+0x62,0xf4,0x7c,0x08,0xf7,0xd1
+
+# ATT: notl %ecx, %ecx
+# INTEL: not ecx, ecx
+0x62,0xf4,0x74,0x18,0xf7,0xd1
+
+# ATT: {evex} notq %r9
+# INTEL: {evex} not r9
+0x62,0xd4,0xfc,0x08,0xf7,0xd1
+
+# ATT: notq %r9, %r9
+# INTEL: not r9, r9
+0x62,0xd4,0xb4,0x18,0xf7,0xd1
+
+# ATT: {evex} notb 291(%r8,%rax,4)
+# INTEL: {evex} not byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0xf6,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: notb 291(%r8,%rax,4), %bl
+# INTEL: not bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x64,0x18,0xf6,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} notw 291(%r8,%rax,4)
+# INTEL: {evex} not word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: notw 291(%r8,%rax,4), %dx
+# INTEL: not dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6d,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} notl 291(%r8,%rax,4)
+# INTEL: {evex} not dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: notl 291(%r8,%rax,4), %ecx
+# INTEL: not ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} notq 291(%r8,%rax,4)
+# INTEL: {evex} not qword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0xfc,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: notq 291(%r8,%rax,4), %r9
+# INTEL: not r9, qword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0xb4,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/or.txt b/llvm/test/MC/Disassembler/X86/apx/or.txt
new file mode 100644
index 000000000000..53225c0659e8
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/or.txt
@@ -0,0 +1,418 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} orb $123, %bl
+# INTEL: {evex} or bl, 123
+0x62,0xf4,0x7c,0x08,0x80,0xcb,0x7b
+
+# ATT: {nf} orb $123, %bl
+# INTEL: {nf} or bl, 123
+0x62,0xf4,0x7c,0x0c,0x80,0xcb,0x7b
+
+# ATT: orb $123, %bl, %cl
+# INTEL: or cl, bl, 123
+0x62,0xf4,0x74,0x18,0x80,0xcb,0x7b
+
+# ATT: {nf} orb $123, %bl, %cl
+# INTEL: {nf} or cl, bl, 123
+0x62,0xf4,0x74,0x1c,0x80,0xcb,0x7b
+
+# ATT: {evex} orw $123, %dx
+# INTEL: {evex} or dx, 123
+0x62,0xf4,0x7d,0x08,0x83,0xca,0x7b
+
+# ATT: {nf} orw $123, %dx
+# INTEL: {nf} or dx, 123
+0x62,0xf4,0x7d,0x0c,0x83,0xca,0x7b
+
+# ATT: orw $123, %dx, %ax
+# INTEL: or ax, dx, 123
+0x62,0xf4,0x7d,0x18,0x83,0xca,0x7b
+
+# ATT: {nf} orw $123, %dx, %ax
+# INTEL: {nf} or ax, dx, 123
+0x62,0xf4,0x7d,0x1c,0x83,0xca,0x7b
+
+# ATT: {evex} orl $123, %ecx
+# INTEL: {evex} or ecx, 123
+0x62,0xf4,0x7c,0x08,0x83,0xc9,0x7b
+
+# ATT: {nf} orl $123, %ecx
+# INTEL: {nf} or ecx, 123
+0x62,0xf4,0x7c,0x0c,0x83,0xc9,0x7b
+
+# ATT: orl $123, %ecx, %edx
+# INTEL: or edx, ecx, 123
+0x62,0xf4,0x6c,0x18,0x83,0xc9,0x7b
+
+# ATT: {nf} orl $123, %ecx, %edx
+# INTEL: {nf} or edx, ecx, 123
+0x62,0xf4,0x6c,0x1c,0x83,0xc9,0x7b
+
+# ATT: {evex} orq $123, %r9
+# INTEL: {evex} or r9, 123
+0x62,0xd4,0xfc,0x08,0x83,0xc9,0x7b
+
+# ATT: {nf} orq $123, %r9
+# INTEL: {nf} or r9, 123
+0x62,0xd4,0xfc,0x0c,0x83,0xc9,0x7b
+
+# ATT: orq $123, %r9, %r15
+# INTEL: or r15, r9, 123
+0x62,0xd4,0x84,0x18,0x83,0xc9,0x7b
+
+# ATT: {nf} orq $123, %r9, %r15
+# INTEL: {nf} or r15, r9, 123
+0x62,0xd4,0x84,0x1c,0x83,0xc9,0x7b
+
+# ATT: {evex} orb $123, 291(%r8,%rax,4)
+# INTEL: {evex} or byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orb $123, 291(%r8,%rax,4)
+# INTEL: {nf} or byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: orb $123, 291(%r8,%rax,4), %bl
+# INTEL: or bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x18,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orb $123, 291(%r8,%rax,4), %bl
+# INTEL: {nf} or bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x1c,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} orw $123, 291(%r8,%rax,4)
+# INTEL: {evex} or word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orw $123, 291(%r8,%rax,4)
+# INTEL: {nf} or word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: orw $123, 291(%r8,%rax,4), %dx
+# INTEL: or dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orw $123, 291(%r8,%rax,4), %dx
+# INTEL: {nf} or dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} orl $123, 291(%r8,%rax,4)
+# INTEL: {evex} or dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orl $123, 291(%r8,%rax,4)
+# INTEL: {nf} or dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: orl $123, 291(%r8,%rax,4), %ecx
+# INTEL: or ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orl $123, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} or ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} orq $123, 291(%r8,%rax,4)
+# INTEL: {evex} or qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orq $123, 291(%r8,%rax,4)
+# INTEL: {nf} or qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: orq $123, 291(%r8,%rax,4), %r9
+# INTEL: or r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} orq $123, 291(%r8,%rax,4), %r9
+# INTEL: {nf} or r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} orw $1234, %dx
+# INTEL: {evex} or dx, 1234
+0x62,0xf4,0x7d,0x08,0x81,0xca,0xd2,0x04
+
+# ATT: {nf} orw $1234, %dx
+# INTEL: {nf} or dx, 1234
+0x62,0xf4,0x7d,0x0c,0x81,0xca,0xd2,0x04
+
+# ATT: orw $1234, %dx, %ax
+# INTEL: or ax, dx, 1234
+0x62,0xf4,0x7d,0x18,0x81,0xca,0xd2,0x04
+
+# ATT: {nf} orw $1234, %dx, %ax
+# INTEL: {nf} or ax, dx, 1234
+0x62,0xf4,0x7d,0x1c,0x81,0xca,0xd2,0x04
+
+# ATT: {evex} orw $1234, 291(%r8,%rax,4)
+# INTEL: {evex} or word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} orw $1234, 291(%r8,%rax,4)
+# INTEL: {nf} or word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: orw $1234, 291(%r8,%rax,4), %dx
+# INTEL: or dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} orw $1234, 291(%r8,%rax,4), %dx
+# INTEL: {nf} or dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {evex} orl $123456, %ecx
+# INTEL: {evex} or ecx, 123456
+0x62,0xf4,0x7c,0x08,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orl $123456, %ecx
+# INTEL: {nf} or ecx, 123456
+0x62,0xf4,0x7c,0x0c,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: orl $123456, %ecx, %edx
+# INTEL: or edx, ecx, 123456
+0x62,0xf4,0x6c,0x18,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orl $123456, %ecx, %edx
+# INTEL: {nf} or edx, ecx, 123456
+0x62,0xf4,0x6c,0x1c,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} orq $123456, %r9
+# INTEL: {evex} or r9, 123456
+0x62,0xd4,0xfc,0x08,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orq $123456, %r9
+# INTEL: {nf} or r9, 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: orq $123456, %r9, %r15
+# INTEL: or r15, r9, 123456
+0x62,0xd4,0x84,0x18,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orq $123456, %r9, %r15
+# INTEL: {nf} or r15, r9, 123456
+0x62,0xd4,0x84,0x1c,0x81,0xc9,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} orl $123456, 291(%r8,%rax,4)
+# INTEL: {evex} or dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orl $123456, 291(%r8,%rax,4)
+# INTEL: {nf} or dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: orl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: or ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} or ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} orq $123456, 291(%r8,%rax,4)
+# INTEL: {evex} or qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orq $123456, 291(%r8,%rax,4)
+# INTEL: {nf} or qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: orq $123456, 291(%r8,%rax,4), %r9
+# INTEL: or r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} orq $123456, 291(%r8,%rax,4), %r9
+# INTEL: {nf} or r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} orb %bl, %cl
+# INTEL: {evex} or cl, bl
+0x62,0xf4,0x7c,0x08,0x08,0xd9
+
+# ATT: {nf} orb %bl, %cl
+# INTEL: {nf} or cl, bl
+0x62,0xf4,0x7c,0x0c,0x08,0xd9
+
+# ATT: orb %bl, %cl, %r8b
+# INTEL: or r8b, cl, bl
+0x62,0xf4,0x3c,0x18,0x08,0xd9
+
+# ATT: {nf} orb %bl, %cl, %r8b
+# INTEL: {nf} or r8b, cl, bl
+0x62,0xf4,0x3c,0x1c,0x08,0xd9
+
+# ATT: {evex} orb %bl, 291(%r8,%rax,4)
+# INTEL: {evex} or byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x08,0x08,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orb %bl, 291(%r8,%rax,4)
+# INTEL: {nf} or byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x0c,0x08,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orb %bl, 291(%r8,%rax,4), %cl
+# INTEL: or cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x18,0x08,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orb %bl, 291(%r8,%rax,4), %cl
+# INTEL: {nf} or cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x1c,0x08,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} orw %dx, %ax
+# INTEL: {evex} or ax, dx
+0x62,0xf4,0x7d,0x08,0x09,0xd0
+
+# ATT: {nf} orw %dx, %ax
+# INTEL: {nf} or ax, dx
+0x62,0xf4,0x7d,0x0c,0x09,0xd0
+
+# ATT: orw %dx, %ax, %r9w
+# INTEL: or r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x09,0xd0
+
+# ATT: {nf} orw %dx, %ax, %r9w
+# INTEL: {nf} or r9w, ax, dx
+0x62,0xf4,0x35,0x1c,0x09,0xd0
+
+# ATT: {evex} orw %dx, 291(%r8,%rax,4)
+# INTEL: {evex} or word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x08,0x09,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orw %dx, 291(%r8,%rax,4)
+# INTEL: {nf} or word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x0c,0x09,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orw %dx, 291(%r8,%rax,4), %ax
+# INTEL: or ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x18,0x09,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orw %dx, 291(%r8,%rax,4), %ax
+# INTEL: {nf} or ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x1c,0x09,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} orl %ecx, %edx
+# INTEL: {evex} or edx, ecx
+0x62,0xf4,0x7c,0x08,0x09,0xca
+
+# ATT: {nf} orl %ecx, %edx
+# INTEL: {nf} or edx, ecx
+0x62,0xf4,0x7c,0x0c,0x09,0xca
+
+# ATT: orl %ecx, %edx, %r10d
+# INTEL: or r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x09,0xca
+
+# ATT: {nf} orl %ecx, %edx, %r10d
+# INTEL: {nf} or r10d, edx, ecx
+0x62,0xf4,0x2c,0x1c,0x09,0xca
+
+# ATT: {evex} orl %ecx, 291(%r8,%rax,4)
+# INTEL: {evex} or dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x08,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orl %ecx, 291(%r8,%rax,4)
+# INTEL: {nf} or dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x0c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: or edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x18,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: {nf} or edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x1c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} orq %r9, %r15
+# INTEL: {evex} or r15, r9
+0x62,0x54,0xfc,0x08,0x09,0xcf
+
+# ATT: {nf} orq %r9, %r15
+# INTEL: {nf} or r15, r9
+0x62,0x54,0xfc,0x0c,0x09,0xcf
+
+# ATT: orq %r9, %r15, %r11
+# INTEL: or r11, r15, r9
+0x62,0x54,0xa4,0x18,0x09,0xcf
+
+# ATT: {nf} orq %r9, %r15, %r11
+# INTEL: {nf} or r11, r15, r9
+0x62,0x54,0xa4,0x1c,0x09,0xcf
+
+# ATT: {evex} orq %r9, 291(%r8,%rax,4)
+# INTEL: {evex} or qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x08,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orq %r9, 291(%r8,%rax,4)
+# INTEL: {nf} or qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x0c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orq %r9, 291(%r8,%rax,4), %r15
+# INTEL: or r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x18,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orq %r9, 291(%r8,%rax,4), %r15
+# INTEL: {nf} or r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x1c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} orb 291(%r8,%rax,4), %bl
+# INTEL: {evex} or bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orb 291(%r8,%rax,4), %bl
+# INTEL: {nf} or bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orb 291(%r8,%rax,4), %bl, %cl
+# INTEL: or cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orb 291(%r8,%rax,4), %bl, %cl
+# INTEL: {nf} or cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x1c,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} orw 291(%r8,%rax,4), %dx
+# INTEL: {evex} or dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0x0b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orw 291(%r8,%rax,4), %dx
+# INTEL: {nf} or dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x0c,0x0b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orw 291(%r8,%rax,4), %dx, %ax
+# INTEL: or ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x18,0x0b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orw 291(%r8,%rax,4), %dx, %ax
+# INTEL: {nf} or ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x1c,0x0b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} orl 291(%r8,%rax,4), %ecx
+# INTEL: {evex} or ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orl 291(%r8,%rax,4), %ecx
+# INTEL: {nf} or ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: or edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x18,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: {nf} or edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x1c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} orq 291(%r8,%rax,4), %r9
+# INTEL: {evex} or r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x08,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orq 291(%r8,%rax,4), %r9
+# INTEL: {nf} or r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x0c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: orq 291(%r8,%rax,4), %r9, %r15
+# INTEL: or r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x18,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} orq 291(%r8,%rax,4), %r9, %r15
+# INTEL: {nf} or r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x1c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt b/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
new file mode 100644
index 000000000000..e686e5aa28af
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
@@ -0,0 +1,400 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+## add
+
+# ATT: {evex} addb %r17b, %r16b
+# INTEL: {evex} add r16b, r17b
+0x62,0xec,0x7c,0x08,0x02,0xc1
+
+# ATT: {evex} addw %r17w, %r16w
+# INTEL: {evex} add r16w, r17w
+0x62,0xec,0x7d,0x08,0x03,0xc1
+
+# ATT: {evex} addl %r17d, %r16d
+# INTEL: {evex} add r16d, r17d
+0x62,0xec,0x7c,0x08,0x03,0xc1
+
+# ATT: {evex} addq %r17, %r16
+# INTEL: {evex} add r16, r17
+0x62,0xec,0xfc,0x08,0x03,0xc1
+
+# ATT: addb %r17b, %r16b, %r18b
+# INTEL: add r18b, r16b, r17b
+0x62,0xec,0x6c,0x10,0x02,0xc1
+
+# ATT: addw %r17w, %r16w, %r18w
+# INTEL: add r18w, r16w, r17w
+0x62,0xec,0x6d,0x10,0x03,0xc1
+
+# ATT: addl %r17d, %r16d, %r18d
+# INTEL: add r18d, r16d, r17d
+0x62,0xec,0x6c,0x10,0x03,0xc1
+
+# ATT: addq %r17, %r16, %r18
+# INTEL: add r18, r16, r17
+0x62,0xec,0xec,0x10,0x03,0xc1
+
+# ATT: {nf} addb %r17b, %r16b
+# INTEL: {nf} add r16b, r17b
+0x62,0xec,0x7c,0x0c,0x02,0xc1
+
+# ATT: {nf} addw %r17w, %r16w
+# INTEL: {nf} add r16w, r17w
+0x62,0xec,0x7d,0x0c,0x03,0xc1
+
+# ATT: {nf} addl %r17d, %r16d
+# INTEL: {nf} add r16d, r17d
+0x62,0xec,0x7c,0x0c,0x03,0xc1
+
+# ATT: {nf} addq %r17, %r16
+# INTEL: {nf} add r16, r17
+0x62,0xec,0xfc,0x0c,0x03,0xc1
+
+# ATT: {nf} addb %r17b, %r16b, %r18b
+# INTEL: {nf} add r18b, r16b, r17b
+0x62,0xec,0x6c,0x14,0x02,0xc1
+
+# ATT: {nf} addw %r17w, %r16w, %r18w
+# INTEL: {nf} add r18w, r16w, r17w
+0x62,0xec,0x6d,0x14,0x03,0xc1
+
+# ATT: {nf} addl %r17d, %r16d, %r18d
+# INTEL: {nf} add r18d, r16d, r17d
+0x62,0xec,0x6c,0x14,0x03,0xc1
+
+# ATT: {nf} addq %r17, %r16, %r18
+# INTEL: {nf} add r18, r16, r17
+0x62,0xec,0xec,0x14,0x03,0xc1
+
+## sub
+
+# ATT: {evex} subb %r17b, %r16b
+# INTEL: {evex} sub r16b, r17b
+0x62,0xec,0x7c,0x08,0x2a,0xc1
+
+# ATT: {evex} subw %r17w, %r16w
+# INTEL: {evex} sub r16w, r17w
+0x62,0xec,0x7d,0x08,0x2b,0xc1
+
+# ATT: {evex} subl %r17d, %r16d
+# INTEL: {evex} sub r16d, r17d
+0x62,0xec,0x7c,0x08,0x2b,0xc1
+
+# ATT: {evex} subq %r17, %r16
+# INTEL: {evex} sub r16, r17
+0x62,0xec,0xfc,0x08,0x2b,0xc1
+
+# ATT: subb %r17b, %r16b, %r18b
+# INTEL: sub r18b, r16b, r17b
+0x62,0xec,0x6c,0x10,0x2a,0xc1
+
+# ATT: subw %r17w, %r16w, %r18w
+# INTEL: sub r18w, r16w, r17w
+0x62,0xec,0x6d,0x10,0x2b,0xc1
+
+# ATT: subl %r17d, %r16d, %r18d
+# INTEL: sub r18d, r16d, r17d
+0x62,0xec,0x6c,0x10,0x2b,0xc1
+
+# ATT: subq %r17, %r16, %r18
+# INTEL: sub r18, r16, r17
+0x62,0xec,0xec,0x10,0x2b,0xc1
+
+# ATT: {nf} subb %r17b, %r16b
+# INTEL: {nf} sub r16b, r17b
+0x62,0xec,0x7c,0x0c,0x2a,0xc1
+
+# ATT: {nf} subw %r17w, %r16w
+# INTEL: {nf} sub r16w, r17w
+0x62,0xec,0x7d,0x0c,0x2b,0xc1
+
+# ATT: {nf} subl %r17d, %r16d
+# INTEL: {nf} sub r16d, r17d
+0x62,0xec,0x7c,0x0c,0x2b,0xc1
+
+# ATT: {nf} subq %r17, %r16
+# INTEL: {nf} sub r16, r17
+0x62,0xec,0xfc,0x0c,0x2b,0xc1
+
+# ATT: {nf} subb %r17b, %r16b, %r18b
+# INTEL: {nf} sub r18b, r16b, r17b
+0x62,0xec,0x6c,0x14,0x2a,0xc1
+
+# ATT: {nf} subw %r17w, %r16w, %r18w
+# INTEL: {nf} sub r18w, r16w, r17w
+0x62,0xec,0x6d,0x14,0x2b,0xc1
+
+# ATT: {nf} subl %r17d, %r16d, %r18d
+# INTEL: {nf} sub r18d, r16d, r17d
+0x62,0xec,0x6c,0x14,0x2b,0xc1
+
+# ATT: {nf} subq %r17, %r16, %r18
+# INTEL: {nf} sub r18, r16, r17
+0x62,0xec,0xec,0x14,0x2b,0xc1
+
+## and
+
+# ATT: {evex} andb %r17b, %r16b
+# INTEL: {evex} and r16b, r17b
+0x62,0xec,0x7c,0x08,0x22,0xc1
+
+# ATT: {evex} andw %r17w, %r16w
+# INTEL: {evex} and r16w, r17w
+0x62,0xec,0x7d,0x08,0x23,0xc1
+
+# ATT: {evex} andl %r17d, %r16d
+# INTEL: {evex} and r16d, r17d
+0x62,0xec,0x7c,0x08,0x23,0xc1
+
+# ATT: {evex} andq %r17, %r16
+# INTEL: {evex} and r16, r17
+0x62,0xec,0xfc,0x08,0x23,0xc1
+
+# ATT: andb %r17b, %r16b, %r18b
+# INTEL: and r18b, r16b, r17b
+0x62,0xec,0x6c,0x10,0x22,0xc1
+
+# ATT: andw %r17w, %r16w, %r18w
+# INTEL: and r18w, r16w, r17w
+0x62,0xec,0x6d,0x10,0x23,0xc1
+
+# ATT: andl %r17d, %r16d, %r18d
+# INTEL: and r18d, r16d, r17d
+0x62,0xec,0x6c,0x10,0x23,0xc1
+
+# ATT: andq %r17, %r16, %r18
+# INTEL: and r18, r16, r17
+0x62,0xec,0xec,0x10,0x23,0xc1
+
+# ATT: {nf} andb %r17b, %r16b
+# INTEL: {nf} and r16b, r17b
+0x62,0xec,0x7c,0x0c,0x22,0xc1
+
+# ATT: {nf} andw %r17w, %r16w
+# INTEL: {nf} and r16w, r17w
+0x62,0xec,0x7d,0x0c,0x23,0xc1
+
+# ATT: {nf} andl %r17d, %r16d
+# INTEL: {nf} and r16d, r17d
+0x62,0xec,0x7c,0x0c,0x23,0xc1
+
+# ATT: {nf} andq %r17, %r16
+# INTEL: {nf} and r16, r17
+0x62,0xec,0xfc,0x0c,0x23,0xc1
+
+# ATT: {nf} andb %r17b, %r16b, %r18b
+# INTEL: {nf} and r18b, r16b, r17b
+0x62,0xec,0x6c,0x14,0x22,0xc1
+
+# ATT: {nf} andw %r17w, %r16w, %r18w
+# INTEL: {nf} and r18w, r16w, r17w
+0x62,0xec,0x6d,0x14,0x23,0xc1
+
+# ATT: {nf} andl %r17d, %r16d, %r18d
+# INTEL: {nf} and r18d, r16d, r17d
+0x62,0xec,0x6c,0x14,0x23,0xc1
+
+# ATT: {nf} andq %r17, %r16, %r18
+# INTEL: {nf} and r18, r16, r17
+0x62,0xec,0xec,0x14,0x23,0xc1
+
+## or
+
+# ATT: {evex} orb %r17b, %r16b
+# INTEL: {evex} or r16b, r17b
+0x62,0xec,0x7c,0x08,0x0a,0xc1
+
+# ATT: {evex} orw %r17w, %r16w
+# INTEL: {evex} or r16w, r17w
+0x62,0xec,0x7d,0x08,0x0b,0xc1
+
+# ATT: {evex} orl %r17d, %r16d
+# INTEL: {evex} or r16d, r17d
+0x62,0xec,0x7c,0x08,0x0b,0xc1
+
+# ATT: {evex} orq %r17, %r16
+# INTEL: {evex} or r16, r17
+0x62,0xec,0xfc,0x08,0x0b,0xc1
+
+# ATT: orb %r17b, %r16b, %r18b
+# INTEL: or r18b, r16b, r17b
+0x62,0xec,0x6c,0x10,0x0a,0xc1
+
+# ATT: orw %r17w, %r16w, %r18w
+# INTEL: or r18w, r16w, r17w
+0x62,0xec,0x6d,0x10,0x0b,0xc1
+
+# ATT: orl %r17d, %r16d, %r18d
+# INTEL: or r18d, r16d, r17d
+0x62,0xec,0x6c,0x10,0x0b,0xc1
+
+# ATT: orq %r17, %r16, %r18
+# INTEL: or r18, r16, r17
+0x62,0xec,0xec,0x10,0x0b,0xc1
+
+# ATT: {nf} orb %r17b, %r16b
+# INTEL: {nf} or r16b, r17b
+0x62,0xec,0x7c,0x0c,0x0a,0xc1
+
+# ATT: {nf} orw %r17w, %r16w
+# INTEL: {nf} or r16w, r17w
+0x62,0xec,0x7d,0x0c,0x0b,0xc1
+
+# ATT: {nf} orl %r17d, %r16d
+# INTEL: {nf} or r16d, r17d
+0x62,0xec,0x7c,0x0c,0x0b,0xc1
+
+# ATT: {nf} orq %r17, %r16
+# INTEL: {nf} or r16, r17
+0x62,0xec,0xfc,0x0c,0x0b,0xc1
+
+# ATT: {nf} orb %r17b, %r16b, %r18b
+# INTEL: {nf} or r18b, r16b, r17b
+0x62,0xec,0x6c,0x14,0x0a,0xc1
+
+# ATT: {nf} orw %r17w, %r16w, %r18w
+# INTEL: {nf} or r18w, r16w, r17w
+0x62,0xec,0x6d,0x14,0x0b,0xc1
+
+# ATT: {nf} orl %r17d, %r16d, %r18d
+# INTEL: {nf} or r18d, r16d, r17d
+0x62,0xec,0x6c,0x14,0x0b,0xc1
+
+# ATT: {nf} orq %r17, %r16, %r18
+# INTEL: {nf} or r18, r16, r17
+0x62,0xec,0xec,0x14,0x0b,0xc1
+
+## xor
+
+# ATT: {evex} xorb %r17b, %r16b
+# INTEL: {evex} xor r16b, r17b
+0x62,0xec,0x7c,0x08,0x32,0xc1
+
+# ATT: {evex} xorw %r17w, %r16w
+# INTEL: {evex} xor r16w, r17w
+0x62,0xec,0x7d,0x08,0x33,0xc1
+
+# ATT: {evex} xorl %r17d, %r16d
+# INTEL: {evex} xor r16d, r17d
+0x62,0xec,0x7c,0x08,0x33,0xc1
+
+# ATT: {evex} xorq %r17, %r16
+# INTEL: {evex} xor r16, r17
+0x62,0xec,0xfc,0x08,0x33,0xc1
+
+# ATT: xorb %r17b, %r16b, %r18b
+# INTEL: xor r18b, r16b, r17b
+0x62,0xec,0x6c,0x10,0x32,0xc1
+
+# ATT: xorw %r17w, %r16w, %r18w
+# INTEL: xor r18w, r16w, r17w
+0x62,0xec,0x6d,0x10,0x33,0xc1
+
+# ATT: xorl %r17d, %r16d, %r18d
+# INTEL: xor r18d, r16d, r17d
+0x62,0xec,0x6c,0x10,0x33,0xc1
+
+# ATT: xorq %r17, %r16, %r18
+# INTEL: xor r18, r16, r17
+0x62,0xec,0xec,0x10,0x33,0xc1
+
+# ATT: {nf} xorb %r17b, %r16b
+# INTEL: {nf} xor r16b, r17b
+0x62,0xec,0x7c,0x0c,0x32,0xc1
+
+# ATT: {nf} xorw %r17w, %r16w
+# INTEL: {nf} xor r16w, r17w
+0x62,0xec,0x7d,0x0c,0x33,0xc1
+
+# ATT: {nf} xorl %r17d, %r16d
+# INTEL: {nf} xor r16d, r17d
+0x62,0xec,0x7c,0x0c,0x33,0xc1
+
+# ATT: {nf} xorq %r17, %r16
+# INTEL: {nf} xor r16, r17
+0x62,0xec,0xfc,0x0c,0x33,0xc1
+
+# ATT: {nf} xorb %r17b, %r16b, %r18b
+# INTEL: {nf} xor r18b, r16b, r17b
+0x62,0xec,0x6c,0x14,0x32,0xc1
+
+# ATT: {nf} xorw %r17w, %r16w, %r18w
+# INTEL: {nf} xor r18w, r16w, r17w
+0x62,0xec,0x6d,0x14,0x33,0xc1
+
+# ATT: {nf} xorl %r17d, %r16d, %r18d
+# INTEL: {nf} xor r18d, r16d, r17d
+0x62,0xec,0x6c,0x14,0x33,0xc1
+
+# ATT: {nf} xorq %r17, %r16, %r18
+# INTEL: {nf} xor r18, r16, r17
+0x62,0xec,0xec,0x14,0x33,0xc1
+
+## adc
+
+# ATT: {evex} adcb %r17b, %r16b
+# INTEL: {evex} adc r16b, r17b
+0x62,0xec,0x7c,0x08,0x12,0xc1
+
+# ATT: {evex} adcw %r17w, %r16w
+# INTEL: {evex} adc r16w, r17w
+0x62,0xec,0x7d,0x08,0x13,0xc1
+
+# ATT: {evex} adcl %r17d, %r16d
+# INTEL: {evex} adc r16d, r17d
+0x62,0xec,0x7c,0x08,0x13,0xc1
+
+# ATT: {evex} adcq %r17, %r16
+# INTEL: {evex} adc r16, r17
+0x62,0xec,0xfc,0x08,0x13,0xc1
+
+# ATT: adcb %r17b, %r16b, %r18b
+# INTEL: adc r18b, r16b, r17b
+0x62,0xec,0x6c,0x10,0x12,0xc1
+
+# ATT: adcw %r17w, %r16w, %r18w
+# INTEL: adc r18w, r16w, r17w
+0x62,0xec,0x6d,0x10,0x13,0xc1
+
+# ATT: adcl %r17d, %r16d, %r18d
+# INTEL: adc r18d, r16d, r17d
+0x62,0xec,0x6c,0x10,0x13,0xc1
+
+# ATT: adcq %r17, %r16, %r18
+# INTEL: adc r18, r16, r17
+0x62,0xec,0xec,0x10,0x13,0xc1
+
+## sbb
+
+# ATT: {evex} sbbb %r17b, %r16b
+# INTEL: {evex} sbb r16b, r17b
+0x62,0xec,0x7c,0x08,0x1a,0xc1
+
+# ATT: {evex} sbbw %r17w, %r16w
+# INTEL: {evex} sbb r16w, r17w
+0x62,0xec,0x7d,0x08,0x1b,0xc1
+
+# ATT: {evex} sbbl %r17d, %r16d
+# INTEL: {evex} sbb r16d, r17d
+0x62,0xec,0x7c,0x08,0x1b,0xc1
+
+# ATT: {evex} sbbq %r17, %r16
+# INTEL: {evex} sbb r16, r17
+0x62,0xec,0xfc,0x08,0x1b,0xc1
+
+# ATT: sbbb %r17b, %r16b, %r18b
+# INTEL: sbb r18b, r16b, r17b
+0x62,0xec,0x6c,0x10,0x1a,0xc1
+
+# ATT: sbbw %r17w, %r16w, %r18w
+# INTEL: sbb r18w, r16w, r17w
+0x62,0xec,0x6d,0x10,0x1b,0xc1
+
+# ATT: sbbl %r17d, %r16d, %r18d
+# INTEL: sbb r18d, r16d, r17d
+0x62,0xec,0x6c,0x10,0x1b,0xc1
+
+# ATT: sbbq %r17, %r16, %r18
+# INTEL: sbb r18, r16, r17
+0x62,0xec,0xec,0x10,0x1b,0xc1
diff --git a/llvm/test/MC/Disassembler/X86/apx/sbb.txt b/llvm/test/MC/Disassembler/X86/apx/sbb.txt
new file mode 100644
index 000000000000..c707d519326c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/sbb.txt
@@ -0,0 +1,210 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} sbbb $123, %bl
+# INTEL: {evex} sbb bl, 123
+0x62,0xf4,0x7c,0x08,0x80,0xdb,0x7b
+
+# ATT: sbbb $123, %bl, %cl
+# INTEL: sbb cl, bl, 123
+0x62,0xf4,0x74,0x18,0x80,0xdb,0x7b
+
+# ATT: {evex} sbbw $123, %dx
+# INTEL: {evex} sbb dx, 123
+0x62,0xf4,0x7d,0x08,0x83,0xda,0x7b
+
+# ATT: sbbw $123, %dx, %ax
+# INTEL: sbb ax, dx, 123
+0x62,0xf4,0x7d,0x18,0x83,0xda,0x7b
+
+# ATT: {evex} sbbl $123, %ecx
+# INTEL: {evex} sbb ecx, 123
+0x62,0xf4,0x7c,0x08,0x83,0xd9,0x7b
+
+# ATT: sbbl $123, %ecx, %edx
+# INTEL: sbb edx, ecx, 123
+0x62,0xf4,0x6c,0x18,0x83,0xd9,0x7b
+
+# ATT: {evex} sbbq $123, %r9
+# INTEL: {evex} sbb r9, 123
+0x62,0xd4,0xfc,0x08,0x83,0xd9,0x7b
+
+# ATT: sbbq $123, %r9, %r15
+# INTEL: sbb r15, r9, 123
+0x62,0xd4,0x84,0x18,0x83,0xd9,0x7b
+
+# ATT: {evex} sbbb $123, 291(%r8,%rax,4)
+# INTEL: {evex} sbb byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x80,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: sbbb $123, 291(%r8,%rax,4), %bl
+# INTEL: sbb bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x18,0x80,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} sbbw $123, 291(%r8,%rax,4)
+# INTEL: {evex} sbb word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: sbbw $123, 291(%r8,%rax,4), %dx
+# INTEL: sbb dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} sbbl $123, 291(%r8,%rax,4)
+# INTEL: {evex} sbb dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: sbbl $123, 291(%r8,%rax,4), %ecx
+# INTEL: sbb ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} sbbq $123, 291(%r8,%rax,4)
+# INTEL: {evex} sbb qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: sbbq $123, 291(%r8,%rax,4), %r9
+# INTEL: sbb r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} sbbw $1234, %dx
+# INTEL: {evex} sbb dx, 1234
+0x62,0xf4,0x7d,0x08,0x81,0xda,0xd2,0x04
+
+# ATT: sbbw $1234, %dx, %ax
+# INTEL: sbb ax, dx, 1234
+0x62,0xf4,0x7d,0x18,0x81,0xda,0xd2,0x04
+
+# ATT: {evex} sbbw $1234, 291(%r8,%rax,4)
+# INTEL: {evex} sbb word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: sbbw $1234, 291(%r8,%rax,4), %dx
+# INTEL: sbb dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {evex} sbbl $123456, %ecx
+# INTEL: {evex} sbb ecx, 123456
+0x62,0xf4,0x7c,0x08,0x81,0xd9,0x40,0xe2,0x01,0x00
+
+# ATT: sbbl $123456, %ecx, %edx
+# INTEL: sbb edx, ecx, 123456
+0x62,0xf4,0x6c,0x18,0x81,0xd9,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} sbbq $123456, %r9
+# INTEL: {evex} sbb r9, 123456
+0x62,0xd4,0xfc,0x08,0x81,0xd9,0x40,0xe2,0x01,0x00
+
+# ATT: sbbq $123456, %r9, %r15
+# INTEL: sbb r15, r9, 123456
+0x62,0xd4,0x84,0x18,0x81,0xd9,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} sbbl $123456, 291(%r8,%rax,4)
+# INTEL: {evex} sbb dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: sbbl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: sbb ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} sbbq $123456, 291(%r8,%rax,4)
+# INTEL: {evex} sbb qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: sbbq $123456, 291(%r8,%rax,4), %r9
+# INTEL: sbb r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} sbbb %bl, %cl
+# INTEL: {evex} sbb cl, bl
+0x62,0xf4,0x7c,0x08,0x18,0xd9
+
+# ATT: sbbb %bl, %cl, %r8b
+# INTEL: sbb r8b, cl, bl
+0x62,0xf4,0x3c,0x18,0x18,0xd9
+
+# ATT: {evex} sbbb %bl, 291(%r8,%rax,4)
+# INTEL: {evex} sbb byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x08,0x18,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbb %bl, 291(%r8,%rax,4), %cl
+# INTEL: sbb cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x18,0x18,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} sbbw %dx, %ax
+# INTEL: {evex} sbb ax, dx
+0x62,0xf4,0x7d,0x08,0x19,0xd0
+
+# ATT: sbbw %dx, %ax, %r9w
+# INTEL: sbb r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x19,0xd0
+
+# ATT: {evex} sbbw %dx, 291(%r8,%rax,4)
+# INTEL: {evex} sbb word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x08,0x19,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbw %dx, 291(%r8,%rax,4), %ax
+# INTEL: sbb ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x18,0x19,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} sbbl %ecx, %edx
+# INTEL: {evex} sbb edx, ecx
+0x62,0xf4,0x7c,0x08,0x19,0xca
+
+# ATT: sbbl %ecx, %edx, %r10d
+# INTEL: sbb r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x19,0xca
+
+# ATT: {evex} sbbl %ecx, 291(%r8,%rax,4)
+# INTEL: {evex} sbb dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x08,0x19,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: sbb edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x18,0x19,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} sbbq %r9, %r15
+# INTEL: {evex} sbb r15, r9
+0x62,0x54,0xfc,0x08,0x19,0xcf
+
+# ATT: sbbq %r9, %r15, %r11
+# INTEL: sbb r11, r15, r9
+0x62,0x54,0xa4,0x18,0x19,0xcf
+
+# ATT: {evex} sbbq %r9, 291(%r8,%rax,4)
+# INTEL: {evex} sbb qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x08,0x19,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbq %r9, 291(%r8,%rax,4), %r15
+# INTEL: sbb r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x18,0x19,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} sbbb 291(%r8,%rax,4), %bl
+# INTEL: {evex} sbb bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x1a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbb 291(%r8,%rax,4), %bl, %cl
+# INTEL: sbb cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0x1a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} sbbw 291(%r8,%rax,4), %dx
+# INTEL: {evex} sbb dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0x1b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbw 291(%r8,%rax,4), %dx, %ax
+# INTEL: sbb ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x18,0x1b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} sbbl 291(%r8,%rax,4), %ecx
+# INTEL: {evex} sbb ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: sbb edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x18,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} sbbq 291(%r8,%rax,4), %r9
+# INTEL: {evex} sbb r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x08,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: sbbq 291(%r8,%rax,4), %r9, %r15
+# INTEL: sbb r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x18,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/sub.txt b/llvm/test/MC/Disassembler/X86/apx/sub.txt
new file mode 100644
index 000000000000..d5ec87d376fd
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/sub.txt
@@ -0,0 +1,418 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} subb $123, %bl
+# INTEL: {evex} sub bl, 123
+0x62,0xf4,0x7c,0x08,0x80,0xeb,0x7b
+
+# ATT: {nf} subb $123, %bl
+# INTEL: {nf} sub bl, 123
+0x62,0xf4,0x7c,0x0c,0x80,0xeb,0x7b
+
+# ATT: subb $123, %bl, %cl
+# INTEL: sub cl, bl, 123
+0x62,0xf4,0x74,0x18,0x80,0xeb,0x7b
+
+# ATT: {nf} subb $123, %bl, %cl
+# INTEL: {nf} sub cl, bl, 123
+0x62,0xf4,0x74,0x1c,0x80,0xeb,0x7b
+
+# ATT: {evex} subw $123, %dx
+# INTEL: {evex} sub dx, 123
+0x62,0xf4,0x7d,0x08,0x83,0xea,0x7b
+
+# ATT: {nf} subw $123, %dx
+# INTEL: {nf} sub dx, 123
+0x62,0xf4,0x7d,0x0c,0x83,0xea,0x7b
+
+# ATT: subw $123, %dx, %ax
+# INTEL: sub ax, dx, 123
+0x62,0xf4,0x7d,0x18,0x83,0xea,0x7b
+
+# ATT: {nf} subw $123, %dx, %ax
+# INTEL: {nf} sub ax, dx, 123
+0x62,0xf4,0x7d,0x1c,0x83,0xea,0x7b
+
+# ATT: {evex} subl $123, %ecx
+# INTEL: {evex} sub ecx, 123
+0x62,0xf4,0x7c,0x08,0x83,0xe9,0x7b
+
+# ATT: {nf} subl $123, %ecx
+# INTEL: {nf} sub ecx, 123
+0x62,0xf4,0x7c,0x0c,0x83,0xe9,0x7b
+
+# ATT: subl $123, %ecx, %edx
+# INTEL: sub edx, ecx, 123
+0x62,0xf4,0x6c,0x18,0x83,0xe9,0x7b
+
+# ATT: {nf} subl $123, %ecx, %edx
+# INTEL: {nf} sub edx, ecx, 123
+0x62,0xf4,0x6c,0x1c,0x83,0xe9,0x7b
+
+# ATT: {evex} subq $123, %r9
+# INTEL: {evex} sub r9, 123
+0x62,0xd4,0xfc,0x08,0x83,0xe9,0x7b
+
+# ATT: {nf} subq $123, %r9
+# INTEL: {nf} sub r9, 123
+0x62,0xd4,0xfc,0x0c,0x83,0xe9,0x7b
+
+# ATT: subq $123, %r9, %r15
+# INTEL: sub r15, r9, 123
+0x62,0xd4,0x84,0x18,0x83,0xe9,0x7b
+
+# ATT: {nf} subq $123, %r9, %r15
+# INTEL: {nf} sub r15, r9, 123
+0x62,0xd4,0x84,0x1c,0x83,0xe9,0x7b
+
+# ATT: {evex} subb $123, 291(%r8,%rax,4)
+# INTEL: {evex} sub byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subb $123, 291(%r8,%rax,4)
+# INTEL: {nf} sub byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: subb $123, 291(%r8,%rax,4), %bl
+# INTEL: sub bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x18,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subb $123, 291(%r8,%rax,4), %bl
+# INTEL: {nf} sub bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x1c,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} subw $123, 291(%r8,%rax,4)
+# INTEL: {evex} sub word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subw $123, 291(%r8,%rax,4)
+# INTEL: {nf} sub word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: subw $123, 291(%r8,%rax,4), %dx
+# INTEL: sub dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subw $123, 291(%r8,%rax,4), %dx
+# INTEL: {nf} sub dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} subl $123, 291(%r8,%rax,4)
+# INTEL: {evex} sub dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subl $123, 291(%r8,%rax,4)
+# INTEL: {nf} sub dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: subl $123, 291(%r8,%rax,4), %ecx
+# INTEL: sub ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subl $123, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} sub ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} subq $123, 291(%r8,%rax,4)
+# INTEL: {evex} sub qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subq $123, 291(%r8,%rax,4)
+# INTEL: {nf} sub qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: subq $123, 291(%r8,%rax,4), %r9
+# INTEL: sub r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} subq $123, 291(%r8,%rax,4), %r9
+# INTEL: {nf} sub r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} subw $1234, %dx
+# INTEL: {evex} sub dx, 1234
+0x62,0xf4,0x7d,0x08,0x81,0xea,0xd2,0x04
+
+# ATT: {nf} subw $1234, %dx
+# INTEL: {nf} sub dx, 1234
+0x62,0xf4,0x7d,0x0c,0x81,0xea,0xd2,0x04
+
+# ATT: subw $1234, %dx, %ax
+# INTEL: sub ax, dx, 1234
+0x62,0xf4,0x7d,0x18,0x81,0xea,0xd2,0x04
+
+# ATT: {nf} subw $1234, %dx, %ax
+# INTEL: {nf} sub ax, dx, 1234
+0x62,0xf4,0x7d,0x1c,0x81,0xea,0xd2,0x04
+
+# ATT: {evex} subw $1234, 291(%r8,%rax,4)
+# INTEL: {evex} sub word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} subw $1234, 291(%r8,%rax,4)
+# INTEL: {nf} sub word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: subw $1234, 291(%r8,%rax,4), %dx
+# INTEL: sub dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} subw $1234, 291(%r8,%rax,4), %dx
+# INTEL: {nf} sub dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {evex} subl $123456, %ecx
+# INTEL: {evex} sub ecx, 123456
+0x62,0xf4,0x7c,0x08,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subl $123456, %ecx
+# INTEL: {nf} sub ecx, 123456
+0x62,0xf4,0x7c,0x0c,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: subl $123456, %ecx, %edx
+# INTEL: sub edx, ecx, 123456
+0x62,0xf4,0x6c,0x18,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subl $123456, %ecx, %edx
+# INTEL: {nf} sub edx, ecx, 123456
+0x62,0xf4,0x6c,0x1c,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} subq $123456, %r9
+# INTEL: {evex} sub r9, 123456
+0x62,0xd4,0xfc,0x08,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subq $123456, %r9
+# INTEL: {nf} sub r9, 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: subq $123456, %r9, %r15
+# INTEL: sub r15, r9, 123456
+0x62,0xd4,0x84,0x18,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subq $123456, %r9, %r15
+# INTEL: {nf} sub r15, r9, 123456
+0x62,0xd4,0x84,0x1c,0x81,0xe9,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} subl $123456, 291(%r8,%rax,4)
+# INTEL: {evex} sub dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subl $123456, 291(%r8,%rax,4)
+# INTEL: {nf} sub dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: subl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: sub ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} sub ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} subq $123456, 291(%r8,%rax,4)
+# INTEL: {evex} sub qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subq $123456, 291(%r8,%rax,4)
+# INTEL: {nf} sub qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: subq $123456, 291(%r8,%rax,4), %r9
+# INTEL: sub r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} subq $123456, 291(%r8,%rax,4), %r9
+# INTEL: {nf} sub r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} subb %bl, %cl
+# INTEL: {evex} sub cl, bl
+0x62,0xf4,0x7c,0x08,0x28,0xd9
+
+# ATT: {nf} subb %bl, %cl
+# INTEL: {nf} sub cl, bl
+0x62,0xf4,0x7c,0x0c,0x28,0xd9
+
+# ATT: subb %bl, %cl, %r8b
+# INTEL: sub r8b, cl, bl
+0x62,0xf4,0x3c,0x18,0x28,0xd9
+
+# ATT: {nf} subb %bl, %cl, %r8b
+# INTEL: {nf} sub r8b, cl, bl
+0x62,0xf4,0x3c,0x1c,0x28,0xd9
+
+# ATT: {evex} subb %bl, 291(%r8,%rax,4)
+# INTEL: {evex} sub byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x08,0x28,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subb %bl, 291(%r8,%rax,4)
+# INTEL: {nf} sub byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x0c,0x28,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subb %bl, 291(%r8,%rax,4), %cl
+# INTEL: sub cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x18,0x28,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subb %bl, 291(%r8,%rax,4), %cl
+# INTEL: {nf} sub cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x1c,0x28,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} subw %dx, %ax
+# INTEL: {evex} sub ax, dx
+0x62,0xf4,0x7d,0x08,0x29,0xd0
+
+# ATT: {nf} subw %dx, %ax
+# INTEL: {nf} sub ax, dx
+0x62,0xf4,0x7d,0x0c,0x29,0xd0
+
+# ATT: subw %dx, %ax, %r9w
+# INTEL: sub r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x29,0xd0
+
+# ATT: {nf} subw %dx, %ax, %r9w
+# INTEL: {nf} sub r9w, ax, dx
+0x62,0xf4,0x35,0x1c,0x29,0xd0
+
+# ATT: {evex} subw %dx, 291(%r8,%rax,4)
+# INTEL: {evex} sub word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x08,0x29,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subw %dx, 291(%r8,%rax,4)
+# INTEL: {nf} sub word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x0c,0x29,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subw %dx, 291(%r8,%rax,4), %ax
+# INTEL: sub ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x18,0x29,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subw %dx, 291(%r8,%rax,4), %ax
+# INTEL: {nf} sub ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x1c,0x29,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} subl %ecx, %edx
+# INTEL: {evex} sub edx, ecx
+0x62,0xf4,0x7c,0x08,0x29,0xca
+
+# ATT: {nf} subl %ecx, %edx
+# INTEL: {nf} sub edx, ecx
+0x62,0xf4,0x7c,0x0c,0x29,0xca
+
+# ATT: subl %ecx, %edx, %r10d
+# INTEL: sub r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x29,0xca
+
+# ATT: {nf} subl %ecx, %edx, %r10d
+# INTEL: {nf} sub r10d, edx, ecx
+0x62,0xf4,0x2c,0x1c,0x29,0xca
+
+# ATT: {evex} subl %ecx, 291(%r8,%rax,4)
+# INTEL: {evex} sub dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x08,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subl %ecx, 291(%r8,%rax,4)
+# INTEL: {nf} sub dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x0c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: sub edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x18,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: {nf} sub edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x1c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} subq %r9, %r15
+# INTEL: {evex} sub r15, r9
+0x62,0x54,0xfc,0x08,0x29,0xcf
+
+# ATT: {nf} subq %r9, %r15
+# INTEL: {nf} sub r15, r9
+0x62,0x54,0xfc,0x0c,0x29,0xcf
+
+# ATT: subq %r9, %r15, %r11
+# INTEL: sub r11, r15, r9
+0x62,0x54,0xa4,0x18,0x29,0xcf
+
+# ATT: {nf} subq %r9, %r15, %r11
+# INTEL: {nf} sub r11, r15, r9
+0x62,0x54,0xa4,0x1c,0x29,0xcf
+
+# ATT: {evex} subq %r9, 291(%r8,%rax,4)
+# INTEL: {evex} sub qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x08,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subq %r9, 291(%r8,%rax,4)
+# INTEL: {nf} sub qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x0c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subq %r9, 291(%r8,%rax,4), %r15
+# INTEL: sub r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x18,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subq %r9, 291(%r8,%rax,4), %r15
+# INTEL: {nf} sub r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x1c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} subb 291(%r8,%rax,4), %bl
+# INTEL: {evex} sub bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subb 291(%r8,%rax,4), %bl
+# INTEL: {nf} sub bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subb 291(%r8,%rax,4), %bl, %cl
+# INTEL: sub cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subb 291(%r8,%rax,4), %bl, %cl
+# INTEL: {nf} sub cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x1c,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} subw 291(%r8,%rax,4), %dx
+# INTEL: {evex} sub dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0x2b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subw 291(%r8,%rax,4), %dx
+# INTEL: {nf} sub dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x0c,0x2b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subw 291(%r8,%rax,4), %dx, %ax
+# INTEL: sub ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x18,0x2b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subw 291(%r8,%rax,4), %dx, %ax
+# INTEL: {nf} sub ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x1c,0x2b,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} subl 291(%r8,%rax,4), %ecx
+# INTEL: {evex} sub ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subl 291(%r8,%rax,4), %ecx
+# INTEL: {nf} sub ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: sub edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x18,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: {nf} sub edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x1c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} subq 291(%r8,%rax,4), %r9
+# INTEL: {evex} sub r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x08,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subq 291(%r8,%rax,4), %r9
+# INTEL: {nf} sub r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x0c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: subq 291(%r8,%rax,4), %r9, %r15
+# INTEL: sub r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x18,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} subq 291(%r8,%rax,4), %r9, %r15
+# INTEL: {nf} sub r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x1c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssd.txt b/llvm/test/MC/Disassembler/X86/apx/wrssd.txt
new file mode 100644
index 000000000000..600e85e1440e
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrssd.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrssd %r18d, 291(%r28,%r29,4)
+# INTEL: wrssd dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssq.txt b/llvm/test/MC/Disassembler/X86/apx/wrssq.txt
new file mode 100644
index 000000000000..9f5b26321fd2
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrssq.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrssq %r19, 291(%r28,%r29,4)
+# INTEL: wrssq qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussd.txt b/llvm/test/MC/Disassembler/X86/apx/wrussd.txt
new file mode 100644
index 000000000000..1b8b0007e2d3
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrussd.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrussd %r18d, 291(%r28,%r29,4)
+# INTEL: wrussd dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussq.txt b/llvm/test/MC/Disassembler/X86/apx/wrussq.txt
new file mode 100644
index 000000000000..7ff51f617c5c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrussq.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrussq %r19, 291(%r28,%r29,4)
+# INTEL: wrussq qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/xor.txt b/llvm/test/MC/Disassembler/X86/apx/xor.txt
new file mode 100644
index 000000000000..69c52989c13f
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/xor.txt
@@ -0,0 +1,418 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT: {evex} xorb $123, %bl
+# INTEL: {evex} xor bl, 123
+0x62,0xf4,0x7c,0x08,0x80,0xf3,0x7b
+
+# ATT: {nf} xorb $123, %bl
+# INTEL: {nf} xor bl, 123
+0x62,0xf4,0x7c,0x0c,0x80,0xf3,0x7b
+
+# ATT: xorb $123, %bl, %cl
+# INTEL: xor cl, bl, 123
+0x62,0xf4,0x74,0x18,0x80,0xf3,0x7b
+
+# ATT: {nf} xorb $123, %bl, %cl
+# INTEL: {nf} xor cl, bl, 123
+0x62,0xf4,0x74,0x1c,0x80,0xf3,0x7b
+
+# ATT: {evex} xorw $123, %dx
+# INTEL: {evex} xor dx, 123
+0x62,0xf4,0x7d,0x08,0x83,0xf2,0x7b
+
+# ATT: {nf} xorw $123, %dx
+# INTEL: {nf} xor dx, 123
+0x62,0xf4,0x7d,0x0c,0x83,0xf2,0x7b
+
+# ATT: xorw $123, %dx, %ax
+# INTEL: xor ax, dx, 123
+0x62,0xf4,0x7d,0x18,0x83,0xf2,0x7b
+
+# ATT: {nf} xorw $123, %dx, %ax
+# INTEL: {nf} xor ax, dx, 123
+0x62,0xf4,0x7d,0x1c,0x83,0xf2,0x7b
+
+# ATT: {evex} xorl $123, %ecx
+# INTEL: {evex} xor ecx, 123
+0x62,0xf4,0x7c,0x08,0x83,0xf1,0x7b
+
+# ATT: {nf} xorl $123, %ecx
+# INTEL: {nf} xor ecx, 123
+0x62,0xf4,0x7c,0x0c,0x83,0xf1,0x7b
+
+# ATT: xorl $123, %ecx, %edx
+# INTEL: xor edx, ecx, 123
+0x62,0xf4,0x6c,0x18,0x83,0xf1,0x7b
+
+# ATT: {nf} xorl $123, %ecx, %edx
+# INTEL: {nf} xor edx, ecx, 123
+0x62,0xf4,0x6c,0x1c,0x83,0xf1,0x7b
+
+# ATT: {evex} xorq $123, %r9
+# INTEL: {evex} xor r9, 123
+0x62,0xd4,0xfc,0x08,0x83,0xf1,0x7b
+
+# ATT: {nf} xorq $123, %r9
+# INTEL: {nf} xor r9, 123
+0x62,0xd4,0xfc,0x0c,0x83,0xf1,0x7b
+
+# ATT: xorq $123, %r9, %r15
+# INTEL: xor r15, r9, 123
+0x62,0xd4,0x84,0x18,0x83,0xf1,0x7b
+
+# ATT: {nf} xorq $123, %r9, %r15
+# INTEL: {nf} xor r15, r9, 123
+0x62,0xd4,0x84,0x1c,0x83,0xf1,0x7b
+
+# ATT: {evex} xorb $123, 291(%r8,%rax,4)
+# INTEL: {evex} xor byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorb $123, 291(%r8,%rax,4)
+# INTEL: {nf} xor byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: xorb $123, 291(%r8,%rax,4), %bl
+# INTEL: xor bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x18,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorb $123, 291(%r8,%rax,4), %bl
+# INTEL: {nf} xor bl, byte ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x64,0x1c,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} xorw $123, 291(%r8,%rax,4)
+# INTEL: {evex} xor word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorw $123, 291(%r8,%rax,4)
+# INTEL: {nf} xor word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7d,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: xorw $123, 291(%r8,%rax,4), %dx
+# INTEL: xor dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorw $123, 291(%r8,%rax,4), %dx
+# INTEL: {nf} xor dx, word ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x6d,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} xorl $123, 291(%r8,%rax,4)
+# INTEL: {evex} xor dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorl $123, 291(%r8,%rax,4)
+# INTEL: {nf} xor dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x7c,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: xorl $123, 291(%r8,%rax,4), %ecx
+# INTEL: xor ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorl $123, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} xor ecx, dword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0x74,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} xorq $123, 291(%r8,%rax,4)
+# INTEL: {evex} xor qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorq $123, 291(%r8,%rax,4)
+# INTEL: {nf} xor qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xfc,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: xorq $123, 291(%r8,%rax,4), %r9
+# INTEL: xor r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {nf} xorq $123, 291(%r8,%rax,4), %r9
+# INTEL: {nf} xor r9, qword ptr [r8 + 4*rax + 291], 123
+0x62,0xd4,0xb4,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b
+
+# ATT: {evex} xorw $1234, %dx
+# INTEL: {evex} xor dx, 1234
+0x62,0xf4,0x7d,0x08,0x81,0xf2,0xd2,0x04
+
+# ATT: {nf} xorw $1234, %dx
+# INTEL: {nf} xor dx, 1234
+0x62,0xf4,0x7d,0x0c,0x81,0xf2,0xd2,0x04
+
+# ATT: xorw $1234, %dx, %ax
+# INTEL: xor ax, dx, 1234
+0x62,0xf4,0x7d,0x18,0x81,0xf2,0xd2,0x04
+
+# ATT: {nf} xorw $1234, %dx, %ax
+# INTEL: {nf} xor ax, dx, 1234
+0x62,0xf4,0x7d,0x1c,0x81,0xf2,0xd2,0x04
+
+# ATT: {evex} xorw $1234, 291(%r8,%rax,4)
+# INTEL: {evex} xor word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} xorw $1234, 291(%r8,%rax,4)
+# INTEL: {nf} xor word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x7d,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: xorw $1234, 291(%r8,%rax,4), %dx
+# INTEL: xor dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {nf} xorw $1234, 291(%r8,%rax,4), %dx
+# INTEL: {nf} xor dx, word ptr [r8 + 4*rax + 291], 1234
+0x62,0xd4,0x6d,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04
+
+# ATT: {evex} xorl $123456, %ecx
+# INTEL: {evex} xor ecx, 123456
+0x62,0xf4,0x7c,0x08,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorl $123456, %ecx
+# INTEL: {nf} xor ecx, 123456
+0x62,0xf4,0x7c,0x0c,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: xorl $123456, %ecx, %edx
+# INTEL: xor edx, ecx, 123456
+0x62,0xf4,0x6c,0x18,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorl $123456, %ecx, %edx
+# INTEL: {nf} xor edx, ecx, 123456
+0x62,0xf4,0x6c,0x1c,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} xorq $123456, %r9
+# INTEL: {evex} xor r9, 123456
+0x62,0xd4,0xfc,0x08,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorq $123456, %r9
+# INTEL: {nf} xor r9, 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: xorq $123456, %r9, %r15
+# INTEL: xor r15, r9, 123456
+0x62,0xd4,0x84,0x18,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorq $123456, %r9, %r15
+# INTEL: {nf} xor r15, r9, 123456
+0x62,0xd4,0x84,0x1c,0x81,0xf1,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} xorl $123456, 291(%r8,%rax,4)
+# INTEL: {evex} xor dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorl $123456, 291(%r8,%rax,4)
+# INTEL: {nf} xor dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x7c,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: xorl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: xor ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorl $123456, 291(%r8,%rax,4), %ecx
+# INTEL: {nf} xor ecx, dword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0x74,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} xorq $123456, 291(%r8,%rax,4)
+# INTEL: {evex} xor qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorq $123456, 291(%r8,%rax,4)
+# INTEL: {nf} xor qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xfc,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: xorq $123456, 291(%r8,%rax,4), %r9
+# INTEL: xor r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {nf} xorq $123456, 291(%r8,%rax,4), %r9
+# INTEL: {nf} xor r9, qword ptr [r8 + 4*rax + 291], 123456
+0x62,0xd4,0xb4,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00
+
+# ATT: {evex} xorb %bl, %cl
+# INTEL: {evex} xor cl, bl
+0x62,0xf4,0x7c,0x08,0x30,0xd9
+
+# ATT: {nf} xorb %bl, %cl
+# INTEL: {nf} xor cl, bl
+0x62,0xf4,0x7c,0x0c,0x30,0xd9
+
+# ATT: xorb %bl, %cl, %r8b
+# INTEL: xor r8b, cl, bl
+0x62,0xf4,0x3c,0x18,0x30,0xd9
+
+# ATT: {nf} xorb %bl, %cl, %r8b
+# INTEL: {nf} xor r8b, cl, bl
+0x62,0xf4,0x3c,0x1c,0x30,0xd9
+
+# ATT: {evex} xorb %bl, 291(%r8,%rax,4)
+# INTEL: {evex} xor byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x08,0x30,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorb %bl, 291(%r8,%rax,4)
+# INTEL: {nf} xor byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x7c,0x0c,0x30,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorb %bl, 291(%r8,%rax,4), %cl
+# INTEL: xor cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x18,0x30,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorb %bl, 291(%r8,%rax,4), %cl
+# INTEL: {nf} xor cl, byte ptr [r8 + 4*rax + 291], bl
+0x62,0xd4,0x74,0x1c,0x30,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} xorw %dx, %ax
+# INTEL: {evex} xor ax, dx
+0x62,0xf4,0x7d,0x08,0x31,0xd0
+
+# ATT: {nf} xorw %dx, %ax
+# INTEL: {nf} xor ax, dx
+0x62,0xf4,0x7d,0x0c,0x31,0xd0
+
+# ATT: xorw %dx, %ax, %r9w
+# INTEL: xor r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x31,0xd0
+
+# ATT: {nf} xorw %dx, %ax, %r9w
+# INTEL: {nf} xor r9w, ax, dx
+0x62,0xf4,0x35,0x1c,0x31,0xd0
+
+# ATT: {evex} xorw %dx, 291(%r8,%rax,4)
+# INTEL: {evex} xor word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x08,0x31,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorw %dx, 291(%r8,%rax,4)
+# INTEL: {nf} xor word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x0c,0x31,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorw %dx, 291(%r8,%rax,4), %ax
+# INTEL: xor ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x18,0x31,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorw %dx, 291(%r8,%rax,4), %ax
+# INTEL: {nf} xor ax, word ptr [r8 + 4*rax + 291], dx
+0x62,0xd4,0x7d,0x1c,0x31,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} xorl %ecx, %edx
+# INTEL: {evex} xor edx, ecx
+0x62,0xf4,0x7c,0x08,0x31,0xca
+
+# ATT: {nf} xorl %ecx, %edx
+# INTEL: {nf} xor edx, ecx
+0x62,0xf4,0x7c,0x0c,0x31,0xca
+
+# ATT: xorl %ecx, %edx, %r10d
+# INTEL: xor r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x31,0xca
+
+# ATT: {nf} xorl %ecx, %edx, %r10d
+# INTEL: {nf} xor r10d, edx, ecx
+0x62,0xf4,0x2c,0x1c,0x31,0xca
+
+# ATT: {evex} xorl %ecx, 291(%r8,%rax,4)
+# INTEL: {evex} xor dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x08,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorl %ecx, 291(%r8,%rax,4)
+# INTEL: {nf} xor dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x7c,0x0c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: xor edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x18,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorl %ecx, 291(%r8,%rax,4), %edx
+# INTEL: {nf} xor edx, dword ptr [r8 + 4*rax + 291], ecx
+0x62,0xd4,0x6c,0x1c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} xorq %r9, %r15
+# INTEL: {evex} xor r15, r9
+0x62,0x54,0xfc,0x08,0x31,0xcf
+
+# ATT: {nf} xorq %r9, %r15
+# INTEL: {nf} xor r15, r9
+0x62,0x54,0xfc,0x0c,0x31,0xcf
+
+# ATT: xorq %r9, %r15, %r11
+# INTEL: xor r11, r15, r9
+0x62,0x54,0xa4,0x18,0x31,0xcf
+
+# ATT: {nf} xorq %r9, %r15, %r11
+# INTEL: {nf} xor r11, r15, r9
+0x62,0x54,0xa4,0x1c,0x31,0xcf
+
+# ATT: {evex} xorq %r9, 291(%r8,%rax,4)
+# INTEL: {evex} xor qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x08,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorq %r9, 291(%r8,%rax,4)
+# INTEL: {nf} xor qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0xfc,0x0c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorq %r9, 291(%r8,%rax,4), %r15
+# INTEL: xor r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x18,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorq %r9, 291(%r8,%rax,4), %r15
+# INTEL: {nf} xor r15, qword ptr [r8 + 4*rax + 291], r9
+0x62,0x54,0x84,0x1c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} xorb 291(%r8,%rax,4), %bl
+# INTEL: {evex} xor bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x32,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorb 291(%r8,%rax,4), %bl
+# INTEL: {nf} xor bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x32,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorb 291(%r8,%rax,4), %bl, %cl
+# INTEL: xor cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x18,0x32,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorb 291(%r8,%rax,4), %bl, %cl
+# INTEL: {nf} xor cl, bl, byte ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x74,0x1c,0x32,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} xorw 291(%r8,%rax,4), %dx
+# INTEL: {evex} xor dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x08,0x33,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorw 291(%r8,%rax,4), %dx
+# INTEL: {nf} xor dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x0c,0x33,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorw 291(%r8,%rax,4), %dx, %ax
+# INTEL: xor ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x18,0x33,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorw 291(%r8,%rax,4), %dx, %ax
+# INTEL: {nf} xor ax, dx, word ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7d,0x1c,0x33,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} xorl 291(%r8,%rax,4), %ecx
+# INTEL: {evex} xor ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x08,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorl 291(%r8,%rax,4), %ecx
+# INTEL: {nf} xor ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x7c,0x0c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: xor edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x18,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorl 291(%r8,%rax,4), %ecx, %edx
+# INTEL: {nf} xor edx, ecx, dword ptr [r8 + 4*rax + 291]
+0x62,0xd4,0x6c,0x1c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {evex} xorq 291(%r8,%rax,4), %r9
+# INTEL: {evex} xor r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x08,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorq 291(%r8,%rax,4), %r9
+# INTEL: {nf} xor r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0xfc,0x0c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: xorq 291(%r8,%rax,4), %r9, %r15
+# INTEL: xor r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x18,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {nf} xorq 291(%r8,%rax,4), %r9, %r15
+# INTEL: {nf} xor r15, r9, qword ptr [r8 + 4*rax + 291]
+0x62,0x54,0x84,0x1c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s
index 924e4326b8e5..1a1d12d7d7df 100644
--- a/llvm/test/MC/LoongArch/Macros/macros-la.s
+++ b/llvm/test/MC/LoongArch/Macros/macros-la.s
@@ -1,66 +1,128 @@
# RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t
+# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax
+# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX
+
+# RELOC: Relocations [
+# RELOC-NEXT: Section ({{.*}}) .rela.text {
la.abs $a0, sym_abs
# CHECK: lu12i.w $a0, %abs_hi20(sym_abs)
# CHECK-NEXT: ori $a0, $a0, %abs_lo12(sym_abs)
# CHECK-NEXT: lu32i.d $a0, %abs64_lo20(sym_abs)
# CHECK-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_abs)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_ABS_HI20 sym_abs 0x0
+# RELOC-NEXT: R_LARCH_ABS_LO12 sym_abs 0x0
+# RELOC-NEXT: R_LARCH_ABS64_LO20 sym_abs 0x0
+# RELOC-NEXT: R_LARCH_ABS64_HI12 sym_abs 0x0
la.pcrel $a0, sym_pcrel
-# CHECK: pcalau12i $a0, %pc_hi20(sym_pcrel)
+# CHECK-NEXT: pcalau12i $a0, %pc_hi20(sym_pcrel)
# CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(sym_pcrel)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_pcrel 0x0
+# RELAX-NEXT: R_LARCH_RELAX - 0x0
+# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0
+# RELAX-NEXT: R_LARCH_RELAX - 0x0
la.pcrel $a0, $a1, sym_pcrel_large
-# CHECK: pcalau12i $a0, %pc_hi20(sym_pcrel_large)
+# CHECK-NEXT: pcalau12i $a0, %pc_hi20(sym_pcrel_large)
# CHECK-NEXT: addi.d $a1, $zero, %pc_lo12(sym_pcrel_large)
# CHECK-NEXT: lu32i.d $a1, %pc64_lo20(sym_pcrel_large)
# CHECK-NEXT: lu52i.d $a1, $a1, %pc64_hi12(sym_pcrel_large)
# CHECK-NEXT: add.d $a0, $a0, $a1
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_pcrel_large 0x0
+# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel_large 0x0
+# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_pcrel_large 0x0
+# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_pcrel_large 0x0
la.got $a0, sym_got
-# CHECK: pcalau12i $a0, %got_pc_hi20(sym_got)
+# CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got)
# CHECK-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_got)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_got 0x0
+# RELAX-NEXT: R_LARCH_RELAX - 0x0
+# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_got 0x0
+# RELAX-NEXT: R_LARCH_RELAX - 0x0
la.got $a0, $a1, sym_got_large
-# CHECK: pcalau12i $a0, %got_pc_hi20(sym_got_large)
+# CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got_large)
# CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_got_large)
# CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_got_large)
# CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_got_large)
# CHECK-NEXT: ldx.d $a0, $a0, $a1
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_got_large 0x0
+# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_got_large 0x0
+# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_got_large 0x0
+# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_got_large 0x0
la.tls.le $a0, sym_le
-# CHECK: lu12i.w $a0, %le_hi20(sym_le)
+# CHECK-NEXT: lu12i.w $a0, %le_hi20(sym_le)
# CHECK-NEXT: ori $a0, $a0, %le_lo12(sym_le)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_LE_HI20 sym_le 0x0
+# RELOC-NEXT: R_LARCH_TLS_LE_LO12 sym_le 0x0
la.tls.ie $a0, sym_ie
-# CHECK: pcalau12i $a0, %ie_pc_hi20(sym_ie)
+# CHECK-NEXT: pcalau12i $a0, %ie_pc_hi20(sym_ie)
# CHECK-NEXT: ld.d $a0, $a0, %ie_pc_lo12(sym_ie)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0
+# RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0
la.tls.ie $a0, $a1, sym_ie_large
-# CHECK: pcalau12i $a0, %ie_pc_hi20(sym_ie_large)
+# CHECK-NEXT: pcalau12i $a0, %ie_pc_hi20(sym_ie_large)
# CHECK-NEXT: addi.d $a1, $zero, %ie_pc_lo12(sym_ie_large)
# CHECK-NEXT: lu32i.d $a1, %ie64_pc_lo20(sym_ie_large)
# CHECK-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(sym_ie_large)
# CHECK-NEXT: ldx.d $a0, $a0, $a1
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_IE64_PC_LO20 sym_ie_large 0x0
+# RELOC-NEXT: R_LARCH_TLS_IE64_PC_HI12 sym_ie_large 0x0
la.tls.ld $a0, sym_ld
-# CHECK: pcalau12i $a0, %ld_pc_hi20(sym_ld)
+# CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld)
# CHECK-NEXT: addi.d $a0, $a0, %got_pc_lo12(sym_ld)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0
+# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0
la.tls.ld $a0, $a1, sym_ld_large
-# CHECK: pcalau12i $a0, %ld_pc_hi20(sym_ld_large)
+# CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld_large)
# CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_ld_large)
# CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_ld_large)
# CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_ld_large)
# CHECK-NEXT: add.d $a0, $a0, $a1
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld_large 0x0
+# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld_large 0x0
+# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_ld_large 0x0
+# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_ld_large 0x0
la.tls.gd $a0, sym_gd
-# CHECK: pcalau12i $a0, %gd_pc_hi20(sym_gd)
+# CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd)
# CHECK-NEXT: addi.d $a0, $a0, %got_pc_lo12(sym_gd)
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0
+# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0
la.tls.gd $a0, $a1, sym_gd_large
-# CHECK: pcalau12i $a0, %gd_pc_hi20(sym_gd_large)
+# CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd_large)
# CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_gd_large)
# CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_gd_large)
# CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_gd_large)
# CHECK-NEXT: add.d $a0, $a0, $a1
+# CHECK-EMPTY:
+# RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd_large 0x0
+# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd_large 0x0
+# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_gd_large 0x0
+# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_gd_large 0x0
+
+# RELOC-NEXT: }
+# RELOC-NEXT: ]
diff --git a/llvm/test/MC/LoongArch/Misc/subsection.s b/llvm/test/MC/LoongArch/Misc/subsection.s
index 0bd22b474536..566a2408d691 100644
--- a/llvm/test/MC/LoongArch/Misc/subsection.s
+++ b/llvm/test/MC/LoongArch/Misc/subsection.s
@@ -1,5 +1,5 @@
# RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,NORELAX --implicit-check-not=error:
-## TODO: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,RELAX --implicit-check-not=error:
+# RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,RELAX --implicit-check-not=error:
a:
nop
diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s
index 532eb4e0561a..c4454f5bb98d 100644
--- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s
+++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s
@@ -18,7 +18,9 @@
# RELAX: Relocations [
# RELAX-NEXT: Section ({{.*}}) .rela.text {
# RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0
+# RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0
# RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0
+# RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0
# RELAX-NEXT: }
# RELAX-NEXT: Section ({{.*}}) .rela.data {
# RELAX-NEXT: 0xF R_LARCH_ADD8 .L3 0x0
@@ -29,13 +31,21 @@
# RELAX-NEXT: 0x12 R_LARCH_SUB32 .L2 0x0
# RELAX-NEXT: 0x16 R_LARCH_ADD64 .L3 0x0
# RELAX-NEXT: 0x16 R_LARCH_SUB64 .L2 0x0
+# RELAX-NEXT: 0x1E R_LARCH_ADD8 .L4 0x0
+# RELAX-NEXT: 0x1E R_LARCH_SUB8 .L3 0x0
+# RELAX-NEXT: 0x1F R_LARCH_ADD16 .L4 0x0
+# RELAX-NEXT: 0x1F R_LARCH_SUB16 .L3 0x0
+# RELAX-NEXT: 0x21 R_LARCH_ADD32 .L4 0x0
+# RELAX-NEXT: 0x21 R_LARCH_SUB32 .L3 0x0
+# RELAX-NEXT: 0x25 R_LARCH_ADD64 .L4 0x0
+# RELAX-NEXT: 0x25 R_LARCH_SUB64 .L3 0x0
# RELAX-NEXT: }
# RELAX-NEXT: ]
# RELAX: Hex dump of section '.data':
# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000000
-# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000808
-# RELAX-NEXT: 0x00000020 00080000 00080000 00000000 00
+# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000000
+# RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00
.text
.L1:
@@ -60,8 +70,6 @@
.short .L3 - .L2
.word .L3 - .L2
.dword .L3 - .L2
-## TODO
-## With relaxation, emit relocs because la.pcrel is a linker-relaxable inst.
.byte .L4 - .L3
.short .L4 - .L3
.word .L4 - .L3
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index f1c080580fe2..4f8a8dfdbcec 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -282,9 +282,6 @@
.attribute arch, "rv32i_zvfbfwma0p8"
# CHECK: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin0p8_zve32f1p0_zve32x1p0_zvfbfmin0p8_zvfbfwma0p8_zvl32b1p0"
-.attribute arch, "rv64i_xsfcie"
-# CHECK: attribute 5, "rv64i2p1_xsfcie1p0"
-
.attribute arch, "rv32izacas1p0"
# CHECK: attribute 5, "rv32i2p1_a2p1_zacas1p0"
@@ -312,5 +309,8 @@
.attribute arch, "rv32i_zicfilp0p4"
# CHECK: attribute 5, "rv32i2p1_zicfilp0p4"
+.attribute arch, "rv32i_zicfiss0p4"
+# CHECK: .attribute 5, "rv32i2p1_zicfiss0p4_zicsr2p0_zimop0p1"
+
.attribute arch, "rv64i_xsfvfwmaccqqq"
# CHECK: attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin0p8_zvl32b1p0_xsfvfwmaccqqq1p0"
diff --git a/llvm/test/MC/RISCV/compressed-zicfiss.s b/llvm/test/MC/RISCV/compressed-zicfiss.s
new file mode 100644
index 000000000000..50ea2e24083e
--- /dev/null
+++ b/llvm/test/MC/RISCV/compressed-zicfiss.s
@@ -0,0 +1,53 @@
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+experimental-zcmop -riscv-no-aliases -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zicfiss,+experimental-zcmop < %s \
+# RUN: | llvm-objdump --mattr=+experimental-zicfiss,+experimental-zcmop -M no-aliases -d -r - \
+# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+experimental-zcmop -riscv-no-aliases -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zicfiss,+experimental-zcmop < %s \
+# RUN: | llvm-objdump --mattr=+experimental-zicfiss,+experimental-zcmop -M no-aliases -d -r - \
+# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+#
+# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+
+# CHECK-ASM-AND-OBJ: c.sspopchk t0
+# CHECK-ASM: encoding: [0x81,0x62]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspopchk x5
+
+# CHECK-ASM-AND-OBJ: c.sspopchk t0
+# CHECK-ASM: encoding: [0x81,0x62]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspopchk t0
+
+# CHECK-ASM-AND-OBJ: c.sspush ra
+# CHECK-ASM: encoding: [0x81,0x60]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspush x1
+
+# CHECK-ASM-AND-OBJ: c.sspush ra
+# CHECK-ASM: encoding: [0x81,0x60]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspush ra
+
+# CHECK-ASM-AND-OBJ: c.sspush ra
+# CHECK-ASM: encoding: [0x81,0x60]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zcmop' (Compressed May-Be-Operations), 'Zicfiss' (Shadow stack)
+c.sspush x1
+
+# CHECK-ASM-AND-OBJ: c.sspush ra
+# CHECK-ASM: encoding: [0x81,0x60]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zcmop' (Compressed May-Be-Operations), 'Zicfiss' (Shadow stack)
+c.sspush ra
+
+# CHECK-ASM-AND-OBJ: c.sspopchk t0
+# CHECK-ASM: encoding: [0x81,0x62]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zcmop' (Compressed May-Be-Operations), 'Zicfiss' (Shadow stack)
+c.sspopchk x5
+
+# CHECK-ASM-AND-OBJ: c.sspopchk t0
+# CHECK-ASM: encoding: [0x81,0x62]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zcmop' (Compressed May-Be-Operations), 'Zicfiss' (Shadow stack)
+c.sspopchk t0
diff --git a/llvm/test/MC/RISCV/fixups-expr.s b/llvm/test/MC/RISCV/fixups-expr.s
index 20e5aacac619..8a02d29de1ab 100644
--- a/llvm/test/MC/RISCV/fixups-expr.s
+++ b/llvm/test/MC/RISCV/fixups-expr.s
@@ -31,19 +31,21 @@ G2:
.half G2-G1
.byte .L2-.L1
.byte G2-G1
-# RELAX: 0x0 R_RISCV_ADD64 .L2 0x0
-# RELAX: 0x0 R_RISCV_SUB64 .L1 0x0
-# RELAX: 0x8 R_RISCV_ADD64 G2 0x0
-# RELAX: 0x8 R_RISCV_SUB64 G1 0x0
-# RELAX: 0x10 R_RISCV_ADD32 .L2 0x0
-# RELAX: 0x10 R_RISCV_SUB32 .L1 0x0
-# RELAX: 0x14 R_RISCV_ADD32 G2 0x0
-# RELAX: 0x14 R_RISCV_SUB32 G1 0x0
-# RELAX: 0x18 R_RISCV_ADD16 .L2 0x0
-# RELAX: 0x18 R_RISCV_SUB16 .L1 0x0
-# RELAX: 0x1A R_RISCV_ADD16 G2 0x0
-# RELAX: 0x1A R_RISCV_SUB16 G1 0x0
-# RELAX: 0x1C R_RISCV_ADD8 .L2 0x0
-# RELAX: 0x1C R_RISCV_SUB8 .L1 0x0
-# RELAX: 0x1D R_RISCV_ADD8 G2 0x0
-# RELAX: 0x1D R_RISCV_SUB8 G1 0x0
+# RELAX: .rela.data {
+# RELAX-NEXT: 0x0 R_RISCV_ADD64 .L2 0x0
+# RELAX-NEXT: 0x0 R_RISCV_SUB64 .L1 0x0
+# RELAX-NEXT: 0x8 R_RISCV_ADD64 G2 0x0
+# RELAX-NEXT: 0x8 R_RISCV_SUB64 G1 0x0
+# RELAX-NEXT: 0x10 R_RISCV_ADD32 .L2 0x0
+# RELAX-NEXT: 0x10 R_RISCV_SUB32 .L1 0x0
+# RELAX-NEXT: 0x14 R_RISCV_ADD32 G2 0x0
+# RELAX-NEXT: 0x14 R_RISCV_SUB32 G1 0x0
+# RELAX-NEXT: 0x18 R_RISCV_ADD16 .L2 0x0
+# RELAX-NEXT: 0x18 R_RISCV_SUB16 .L1 0x0
+# RELAX-NEXT: 0x1A R_RISCV_ADD16 G2 0x0
+# RELAX-NEXT: 0x1A R_RISCV_SUB16 G1 0x0
+# RELAX-NEXT: 0x1C R_RISCV_ADD8 .L2 0x0
+# RELAX-NEXT: 0x1C R_RISCV_SUB8 .L1 0x0
+# RELAX-NEXT: 0x1D R_RISCV_ADD8 G2 0x0
+# RELAX-NEXT: 0x1D R_RISCV_SUB8 G1 0x0
+# RELAX-NEXT: }
diff --git a/llvm/test/MC/RISCV/machine-csr-names.s b/llvm/test/MC/RISCV/machine-csr-names.s
index 664cf0301eff..3b6d73b5118b 100644
--- a/llvm/test/MC/RISCV/machine-csr-names.s
+++ b/llvm/test/MC/RISCV/machine-csr-names.s
@@ -9,9 +9,6 @@
# RUN: llvm-mc -filetype=obj -triple riscv64 < %s \
# RUN: | llvm-objdump -d - \
# RUN: | FileCheck -check-prefix=CHECK-INST-ALIAS %s
-#
-# RUN: llvm-mc -triple riscv32 %s 2>&1 | FileCheck -check-prefix CHECK-WARN %s
-# RUN: llvm-mc -triple riscv64 %s 2>&1 | FileCheck -check-prefix CHECK-WARN %s
##################################
# Machine Information Registers
@@ -1495,8 +1492,6 @@ csrrs t1, dscratch, zero
# uimm12
csrrs t2, 0x7B2, zero
-# CHECK-WARN: warning: 'dscratch' is a deprecated alias for 'dscratch0'
-
# dscratch1
# name
# CHECK-INST: csrrs t1, dscratch1, zero
@@ -1949,8 +1944,6 @@ csrrs t1, mucounteren, zero
# uimm12
csrrs t2, 0x320, zero
-# CHECK-WARN: warning: 'mucounteren' is a deprecated alias for 'mcountinhibit'
-
# mhpmevent3
# name
# CHECK-INST: csrrs t1, mhpmevent3, zero
diff --git a/llvm/test/MC/RISCV/rv32zcmop-invalid.s b/llvm/test/MC/RISCV/rv32zcmop-invalid.s
new file mode 100644
index 000000000000..1641c8ddd00b
--- /dev/null
+++ b/llvm/test/MC/RISCV/rv32zcmop-invalid.s
@@ -0,0 +1,7 @@
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zcmop < %s 2>&1 | FileCheck %s
+
+cmop.0 # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic
+
+cmop.1 t0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction
+
+cmop.1 0x0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
new file mode 100644
index 000000000000..1cedcb97e2e7
--- /dev/null
+++ b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s
@@ -0,0 +1,17 @@
+# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \
+# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s
+
+# CHECK-ERR: error: invalid operand for instruction
+sspopchk a1
+
+# CHECK-ERR: error: invalid operand for instruction
+c.sspush t0
+
+# CHECK-ERR: error: invalid operand for instruction
+c.sspopchk ra
+
+# CHECK-ERR: error: invalid operand for instruction
+sspush a0
+
+# CHECK-ERR: error: invalid operand for instruction
+ssrdp zero
diff --git a/llvm/test/MC/RISCV/rv32zimop-invalid.s b/llvm/test/MC/RISCV/rv32zimop-invalid.s
new file mode 100644
index 000000000000..e6c3adc4cd30
--- /dev/null
+++ b/llvm/test/MC/RISCV/rv32zimop-invalid.s
@@ -0,0 +1,6 @@
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zimop < %s 2>&1 | FileCheck %s
+
+# Too few operands
+mop.r.0 t0 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
+# Too few operands
+mop.rr.0 t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction
diff --git a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
new file mode 100644
index 000000000000..1296940455e8
--- /dev/null
+++ b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s
@@ -0,0 +1,17 @@
+# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -riscv-no-aliases -show-encoding \
+# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s
+
+# CHECK-ERR: error: invalid operand for instruction
+sspopchk a1
+
+# CHECK-ERR: error: invalid operand for instruction
+c.sspush t0
+
+# CHECK-ERR: error: invalid operand for instruction
+c.sspopchk ra
+
+# CHECK-ERR: error: invalid operand for instruction
+sspush a0
+
+# CHECK-ERR: error: invalid operand for instruction
+ssrdp zero
diff --git a/llvm/test/MC/RISCV/rvzcmop-valid.s b/llvm/test/MC/RISCV/rvzcmop-valid.s
new file mode 100644
index 000000000000..c26bb2959fed
--- /dev/null
+++ b/llvm/test/MC/RISCV/rvzcmop-valid.s
@@ -0,0 +1,42 @@
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zcmop -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zcmop -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zcmop < %s \
+# RUN: | llvm-objdump --mattr=+experimental-zcmop -d -r - \
+# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zcmop < %s \
+# RUN: | llvm-objdump --mattr=+experimental-zcmop -d -r - \
+# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+
+# CHECK-ASM-AND-OBJ: cmop.1
+# CHECK-ASM: encoding: [0x81,0x60]
+cmop.1
+
+# CHECK-ASM-AND-OBJ: cmop.3
+# CHECK-ASM: encoding: [0x81,0x61]
+cmop.3
+
+# CHECK-ASM-AND-OBJ: cmop.5
+# CHECK-ASM: encoding: [0x81,0x62]
+cmop.5
+
+# CHECK-ASM-AND-OBJ: cmop.7
+# CHECK-ASM: encoding: [0x81,0x63]
+cmop.7
+
+# CHECK-ASM-AND-OBJ: cmop.9
+# CHECK-ASM: encoding: [0x81,0x64]
+cmop.9
+
+# CHECK-ASM-AND-OBJ: cmop.11
+# CHECK-ASM: encoding: [0x81,0x65]
+cmop.11
+
+# CHECK-ASM-AND-OBJ: cmop.13
+# CHECK-ASM: encoding: [0x81,0x66]
+cmop.13
+
+# CHECK-ASM-AND-OBJ: cmop.15
+# CHECK-ASM: encoding: [0x81,0x67]
+cmop.15
diff --git a/llvm/test/MC/RISCV/rvzimop-valid.s b/llvm/test/MC/RISCV/rvzimop-valid.s
new file mode 100644
index 000000000000..155293662990
--- /dev/null
+++ b/llvm/test/MC/RISCV/rvzimop-valid.s
@@ -0,0 +1,26 @@
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zimop -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zimop -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zimop < %s \
+# RUN: | llvm-objdump --mattr=+experimental-zimop -d -r - \
+# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zimop < %s \
+# RUN: | llvm-objdump --mattr=+experimental-zimop -d -r - \
+# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+
+# CHECK-ASM-AND-OBJ: mop.r.0 a2, a1
+# CHECK-ASM: encoding: [0x73,0xc6,0xc5,0x81]
+mop.r.0 a2, a1
+
+# CHECK-ASM-AND-OBJ: mop.r.31 a2, a1
+# CHECK-ASM: encoding: [0x73,0xc6,0xf5,0xcd]
+mop.r.31 a2, a1
+
+# CHECK-ASM-AND-OBJ: mop.rr.0 a3, a2, a1
+# CHECK-ASM: encoding: [0xf3,0x46,0xb6,0x82]
+mop.rr.0 a3, a2, a1
+
+# CHECK-ASM-AND-OBJ: mop.rr.7 a3, a2, a1
+# CHECK-ASM: encoding: [0xf3,0x46,0xb6,0xce]
+mop.rr.7 a3, a2, a1
\ No newline at end of file
diff --git a/llvm/test/MC/RISCV/xsfcie-invalid.s b/llvm/test/MC/RISCV/xsfcie-invalid.s
deleted file mode 100644
index a84ffeeaa054..000000000000
--- a/llvm/test/MC/RISCV/xsfcie-invalid.s
+++ /dev/null
@@ -1,39 +0,0 @@
-# SCIE - SiFive Custom Instructions Extension.
-# RUN: not llvm-mc -triple riscv32 -mattr=-xsfcie < %s 2>&1 | FileCheck %s
-# RUN: not llvm-mc -triple riscv64 -mattr=-xsfcie < %s 2>&1 | FileCheck %s
-
-cflush.d.l1 0x10 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction
-
-cdiscard.d.l1 0x10 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction
-
-cflush.d.l1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcie' (SiFive Custom Instruction Extension SCIE.)
-
-cdiscard.d.l1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcie' (SiFive Custom Instruction Extension SCIE.)
-
-cflush.d.l1 x0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcie' (SiFive Custom Instruction Extension SCIE.)
-
-cflush.d.l1 x7 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcie' (SiFive Custom Instruction Extension SCIE.)
-
-cdiscard.d.l1 x0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcie' (SiFive Custom Instruction Extension SCIE.)
-
-cdiscard.d.l1 x7 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcie' (SiFive Custom Instruction Extension SCIE.)
-
-cease x1 # CHECK: :[[@LINE]]:7: error: invalid operand for instruction
-
-cease 0x10 # CHECK: :[[@LINE]]:7: error: invalid operand for instruction
-
-cease # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'XSfcie' (SiFive Custom Instruction Extension SCIE.)
-
-csrr t1, mbpm # CHECK: :[[@LINE]]:10: error: system register use requires an option to be enabled
-
-csrr t1, mfd # CHECK: :[[@LINE]]:10: error: system register use requires an option to be enabled
-
-csrr t1, mpd # CHECK: :[[@LINE]]:10: error: system register use requires an option to be enabled
-
-csrr t1, mnscratch # CHECK: :[[@LINE]]:10: error: system register use requires an option to be enabled
-
-csrr t1, mnepc # CHECK: :[[@LINE]]:10: error: system register use requires an option to be enabled
-
-csrr t1, mncause # CHECK: :[[@LINE]]:10: error: system register use requires an option to be enabled
-
-csrr t1, mnstatus # CHECK: :[[@LINE]]:10: error: system register use requires an option to be enabled
diff --git a/llvm/test/MC/RISCV/xsfcie-valid.s b/llvm/test/MC/RISCV/xsfcie-valid.s
deleted file mode 100644
index 25f743f5ed79..000000000000
--- a/llvm/test/MC/RISCV/xsfcie-valid.s
+++ /dev/null
@@ -1,136 +0,0 @@
-# SCIE - SiFive Custom Instructions Extension.
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsfcie -riscv-no-aliases -show-encoding \
-# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsfcie -riscv-no-aliases -show-encoding \
-# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv32 -mattr=+xsfcie -riscv-no-aliases -show-encoding 2>&1 \
-# RUN: | FileCheck -check-prefixes=CHECK-WARN %s
-# RUN: llvm-mc %s -triple=riscv64 -mattr=+xsfcie -riscv-no-aliases -show-encoding 2>&1 \
-# RUN: | FileCheck -check-prefixes=CHECK-WARN %s
-# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xsfcie < %s \
-# RUN: | llvm-objdump --mattr=+xsfcie -M no-aliases -d - \
-# RUN: | FileCheck -check-prefix=CHECK-INST %s
-# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xsfcie < %s \
-# RUN: | llvm-objdump --mattr=+xsfcie -M no-aliases -d - \
-# RUN: | FileCheck -check-prefix=CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mcpu=sifive-s76 -riscv-no-aliases -show-encoding \
-# RUN: | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
-# RUN: llvm-mc %s -triple=riscv64 -mcpu=sifive-s76 -riscv-no-aliases -show-encoding 2>&1 \
-# RUN: | FileCheck -check-prefixes=CHECK-WARN %s
-# RUN: llvm-mc -filetype=obj -triple riscv64 -mcpu=sifive-s76 < %s \
-# RUN: | llvm-objdump --mcpu=sifive-s76 -M no-aliases -d - \
-# RUN: | FileCheck -check-prefix=CHECK-INST %s
-
-# CHECK-INST: cflush.d.l1 zero
-# CHECK-ENC: encoding: [0x73,0x00,0x00,0xfc]
-# CHECK-INST: cflush.d.l1 zero
-# CHECK-ENC: encoding: [0x73,0x00,0x00,0xfc]
-cflush.d.l1 x0
-cflush.d.l1
-
-# CHECK-INST: cflush.d.l1 t2
-# CHECK-ENC: encoding: [0x73,0x80,0x03,0xfc]
-cflush.d.l1 x7
-
-# CHECK-INST: cdiscard.d.l1 zero
-# CHECK-ENC: encoding: [0x73,0x00,0x20,0xfc]
-# CHECK-INST: cdiscard.d.l1 zero
-# CHECK-ENC: encoding: [0x73,0x00,0x20,0xfc]
-cdiscard.d.l1 x0
-cdiscard.d.l1
-
-# CHECK-INST: cdiscard.d.l1 t2
-# CHECK-ENC: encoding: [0x73,0x80,0x23,0xfc]
-cdiscard.d.l1 x7
-
-# CHECK-INST: cease
-# CHECK-ENC: encoding: [0x73,0x00,0x50,0x30]
-cease
-
-# mbpm
-# name
-# CHECK-INST: csrrs t2, mbpm, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x00,0x7c]
-# uimm12
-# CHECK-INST: csrrs t2, mbpm, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x00,0x7c]
-# name
-csrrs t2, mbpm, zero
-# uimm12
-csrrs t2, 0x7C0, zero
-
-# mfd
-# name
-# CHECK-INST: csrrs t2, mfd, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x10,0x7c]
-# uimm12
-# CHECK-INST: csrrs t2, mfd, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x10,0x7c]
-# name
-csrrs t2, mfd, zero
-# uimm12
-csrrs t2, 0x7C1, zero
-
-# mpd
-# name
-# CHECK-INST: csrrs t2, mpd, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x80,0x7c]
-# uimm12
-# CHECK-INST: csrrs t2, mpd, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x80,0x7c]
-# name
-csrrs t2, mpd, zero
-# uimm12
-csrrs t2, 0x7C8, zero
-
-# mnscratch
-# name
-# CHECK-INST: csrrs t1, mnscratch, zero
-# CHECK-ENC: encoding: [0x73,0x23,0x00,0x35]
-# CHECK-WARN: warning: 'miselect' CSR is not available on the current subtarget. Instead 'mnscratch' CSR will be used.
-# uimm12
-# CHECK-INST: csrrs t2, mnscratch, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x00,0x35]
-# name
-csrrs t1, mnscratch, zero
-csrrs t1, miselect, zero
-# uimm12
-csrrs t2, 0x350, zero
-
-# mnepc
-# name
-# CHECK-INST: csrrs t1, mnepc, zero
-# CHECK-ENC: encoding: [0x73,0x23,0x10,0x35]
-# CHECK-WARN: warning: 'mireg' CSR is not available on the current subtarget. Instead 'mnepc' CSR will be used.
-# uimm12
-# CHECK-INST: csrrs t2, mnepc, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x10,0x35]
-# name
-csrrs t1, mnepc, zero
-csrrs t1, mireg, zero
-# uimm12
-csrrs t2, 0x351, zero
-
-# mncause
-# name
-# CHECK-INST: csrrs t1, mncause, zero
-# CHECK-ENC: encoding: [0x73,0x23,0x20,0x35]
-# uimm12
-# CHECK-INST: csrrs t2, mncause, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x20,0x35]
-# name
-csrrs t1, mncause, zero
-# uimm12
-csrrs t2, 0x352, zero
-
-# mnstatus
-# name
-# CHECK-INST: csrrs t1, mnstatus, zero
-# CHECK-ENC: encoding: [0x73,0x23,0x30,0x35]
-# uimm12
-# CHECK-INST: csrrs t2, mnstatus, zero
-# CHECK-ENC: encoding: [0xf3,0x23,0x30,0x35]
-# name
-csrrs t1, mnstatus, zero
-# uimm12
-csrrs t2, 0x353, zero
diff --git a/llvm/test/MC/RISCV/zicfiss-valid.s b/llvm/test/MC/RISCV/zicfiss-valid.s
new file mode 100644
index 000000000000..fd69d37d7cfa
--- /dev/null
+++ b/llvm/test/MC/RISCV/zicfiss-valid.s
@@ -0,0 +1,102 @@
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a,+experimental-zicfiss < %s \
+# RUN: | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \
+# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc %s -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss -riscv-no-aliases -show-encoding \
+# RUN: | FileCheck -check-prefixes=CHECK-ASM-RV64,CHECK-ASM,CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s
+# RUN: llvm-mc -filetype=obj -triple=riscv64 -defsym=RV64=1 -mattr=+a,+experimental-zicfiss < %s \
+# RUN: | llvm-objdump --mattr=+a,+experimental-zicfiss -M no-aliases -d -r - \
+# RUN: | FileCheck --check-prefixes=CHECK-ASM-AND-OBJ-RV64,CHECK-ASM-AND-OBJ %s
+#
+# RUN: not llvm-mc -triple riscv32 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: | FileCheck -check-prefixes=CHECK-NO-EXT %s
+# RUN: not llvm-mc -triple riscv64 -defsym=RV64=1 -riscv-no-aliases -show-encoding < %s 2>&1 \
+# RUN: | FileCheck -check-prefixes=CHECK-NO-EXT-RV64 %s
+
+# CHECK-ASM-AND-OBJ: sspopchk ra
+# CHECK-ASM: encoding: [0x73,0xc0,0xc0,0xcd]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspopchk x1
+
+# CHECK-ASM-AND-OBJ: sspopchk ra
+# CHECK-ASM: encoding: [0x73,0xc0,0xc0,0xcd]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspopchk ra
+
+# CHECK-ASM-AND-OBJ: sspopchk t0
+# CHECK-ASM: encoding: [0x73,0xc0,0xc2,0xcd]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspopchk x5
+
+# CHECK-ASM-AND-OBJ: sspopchk t0
+# CHECK-ASM: encoding: [0x73,0xc0,0xc2,0xcd]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspopchk t0
+
+# CHECK-ASM-AND-OBJ: sspush ra
+# CHECK-ASM: encoding: [0x73,0x40,0x10,0xce]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspush x1
+
+# CHECK-ASM-AND-OBJ: sspush ra
+# CHECK-ASM: encoding: [0x73,0x40,0x10,0xce]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspush ra
+
+# CHECK-ASM-AND-OBJ: sspush t0
+# CHECK-ASM: encoding: [0x73,0x40,0x50,0xce]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspush x5
+
+# CHECK-ASM-AND-OBJ: sspush t0
+# CHECK-ASM: encoding: [0x73,0x40,0x50,0xce]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+sspush t0
+
+# CHECK-ASM-AND-OBJ: ssrdp ra
+# CHECK-ASM: encoding: [0xf3,0x40,0xc0,0xcd]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssrdp ra
+
+# CHECK-ASM-AND-OBJ: ssamoswap.w a4, ra, (s0)
+# CHECK-ASM: encoding: [0x2f,0x27,0x14,0x48]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.w a4, ra, (s0)
+
+# CHECK-ASM-AND-OBJ: ssamoswap.w.aq a4, ra, (s0)
+# CHECK-ASM: encoding: [0x2f,0x27,0x14,0x4c]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.w.aq a4, ra, (s0)
+
+# CHECK-ASM-AND-OBJ: ssamoswap.w.rl a4, ra, (s0)
+# CHECK-ASM: encoding: [0x2f,0x27,0x14,0x4a]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.w.rl a4, ra, (s0)
+
+# CHECK-ASM-AND-OBJ: ssamoswap.w.aqrl a4, ra, (s0)
+# CHECK-ASM: encoding: [0x2f,0x27,0x14,0x4e]
+# CHECK-NO-EXT: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.w.aqrl a4, ra, (s0)
+
+.ifdef RV64
+# CHECK-ASM-AND-OBJ-RV64: ssamoswap.d a4, ra, (s0)
+# CHECK-ASM-RV64: encoding: [0x2f,0x37,0x14,0x48]
+# CHECK-NO-EXT-RV64: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.d a4, ra, (s0)
+
+# CHECK-ASM-AND-OBJ-RV64: ssamoswap.d.aq a4, ra, (s0)
+# CHECK-ASM-RV64: encoding: [0x2f,0x37,0x14,0x4c]
+# CHECK-NO-EXT-RV64: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.d.aq a4, ra, (s0)
+
+# CHECK-ASM-AND-OBJ-RV64: ssamoswap.d.rl a4, ra, (s0)
+# CHECK-ASM-RV64: encoding: [0x2f,0x37,0x14,0x4a]
+# CHECK-NO-EXT-RV64: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.d.rl a4, ra, (s0)
+
+# CHECK-ASM-AND-OBJ-RV64: ssamoswap.d.aqrl a4, ra, (s0)
+# CHECK-ASM-RV64: encoding: [0x2f,0x37,0x14,0x4e]
+# CHECK-NO-EXT-RV64: error: instruction requires the following: 'Zicfiss' (Shadow stack)
+ssamoswap.d.aqrl a4, ra, (s0)
+.endif
diff --git a/llvm/test/MC/WebAssembly/custom-sections.ll b/llvm/test/MC/WebAssembly/custom-sections.ll
index cf2d7098ae4b..ba669716c934 100644
--- a/llvm/test/MC/WebAssembly/custom-sections.ll
+++ b/llvm/test/MC/WebAssembly/custom-sections.ll
@@ -15,18 +15,18 @@ target triple = "wasm32-unknown-unknown"
; CHECK: Section {
; CHECK: Type: CUSTOM (0x0)
; CHECK: Size: 3
-; CHECK: Offset: 38
+; CHECK: Offset: 44
; CHECK: Name: red
; CHECK: }
; CHECK: Section {
; CHECK: Type: CUSTOM (0x0)
; CHECK: Size: 6
-; CHECK: Offset: 51
+; CHECK: Offset: 57
; CHECK: Name: green
; CHECK: }
; CHECK: Section {
; CHECK: Type: CUSTOM (0x0)
; CHECK: Size: 25
-; CHECK: Offset: 84
+; CHECK: Offset: 90
; CHECK: Name: producers
; CHECK: }
diff --git a/llvm/test/MC/WebAssembly/debug-info.ll b/llvm/test/MC/WebAssembly/debug-info.ll
index c8ab7a93165f..a65ce0ee8392 100644
--- a/llvm/test/MC/WebAssembly/debug-info.ll
+++ b/llvm/test/MC/WebAssembly/debug-info.ll
@@ -7,37 +7,37 @@
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: TYPE (0x1)
; CHECK-NEXT: Size: 4
-; CHECK-NEXT: Offset: 8
+; CHECK-NEXT: Offset: 14
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: IMPORT (0x2)
; CHECK-NEXT: Size: 81
-; CHECK-NEXT: Offset: 18
+; CHECK-NEXT: Offset: 24
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: FUNCTION (0x3)
; CHECK-NEXT: Size: 2
-; CHECK-NEXT: Offset: 105
+; CHECK-NEXT: Offset: 111
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: ELEM (0x9)
; CHECK-NEXT: Size: 7
-; CHECK-NEXT: Offset: 113
+; CHECK-NEXT: Offset: 119
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: DATACOUNT (0xC)
; CHECK-NEXT: Size: 1
-; CHECK-NEXT: Offset: 126
+; CHECK-NEXT: Offset: 132
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CODE (0xA)
; CHECK-NEXT: Size: 4
-; CHECK-NEXT: Offset: 133
+; CHECK-NEXT: Offset: 139
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: DATA (0xB)
; CHECK-NEXT: Size: 19
-; CHECK-NEXT: Offset: 143
+; CHECK-NEXT: Offset: 149
; CHECK-NEXT: Segments [
; CHECK-NEXT: Segment {
; CHECK-NEXT: Name: .data.foo
@@ -54,91 +54,91 @@
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 86
-; CHECK-NEXT: Offset: 168
+; CHECK-NEXT: Offset: 174
; CHECK-NEXT: Name: .debug_abbrev
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 114
-; CHECK-NEXT: Offset: 274
+; CHECK-NEXT: Offset: 280
; CHECK-NEXT: Name: .debug_info
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 48
-; CHECK-NEXT: Offset: 406
+; CHECK-NEXT: Offset: 412
; CHECK-NEXT: Name: .debug_aranges
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 121
-; CHECK-NEXT: Offset: 475
+; CHECK-NEXT: Offset: 481
; CHECK-NEXT: Name: .debug_str
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 42
-; CHECK-NEXT: Offset: 613
+; CHECK-NEXT: Offset: 619
; CHECK-NEXT: Name: .debug_pubnames
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 26
-; CHECK-NEXT: Offset: 677
+; CHECK-NEXT: Offset: 683
; CHECK-NEXT: Name: .debug_pubtypes
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 56
-; CHECK-NEXT: Offset: 725
+; CHECK-NEXT: Offset: 731
; CHECK-NEXT: Name: .debug_line
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 91
-; CHECK-NEXT: Offset: 799
+; CHECK-NEXT: Offset: 805
; CHECK-NEXT: Name: linking
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 9
-; CHECK-NEXT: Offset: 904
+; CHECK-NEXT: Offset: 910
; CHECK-NEXT: Name: reloc.DATA
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 61
-; CHECK-NEXT: Offset: 930
+; CHECK-NEXT: Offset: 936
; CHECK-NEXT: Name: reloc..debug_info
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 18
-; CHECK-NEXT: Offset: 1015
+; CHECK-NEXT: Offset: 1021
; CHECK-NEXT: Name: reloc..debug_aranges
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 6
-; CHECK-NEXT: Offset: 1060
+; CHECK-NEXT: Offset: 1066
; CHECK-NEXT: Name: reloc..debug_pubnames
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 6
-; CHECK-NEXT: Offset: 1094
+; CHECK-NEXT: Offset: 1100
; CHECK-NEXT: Name: reloc..debug_pubtypes
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 6
-; CHECK-NEXT: Offset: 1128
+; CHECK-NEXT: Offset: 1134
; CHECK-NEXT: Name: reloc..debug_line
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 77
-; CHECK-NEXT: Offset: 1158
+; CHECK-NEXT: Offset: 1164
; CHECK-NEXT: Name: producers
; CHECK-NEXT: }
; CHECK-NEXT:]
diff --git a/llvm/test/MC/WebAssembly/debug-info64.ll b/llvm/test/MC/WebAssembly/debug-info64.ll
index a63200c908b7..d0081164d73e 100644
--- a/llvm/test/MC/WebAssembly/debug-info64.ll
+++ b/llvm/test/MC/WebAssembly/debug-info64.ll
@@ -7,37 +7,37 @@
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: TYPE (0x1)
; CHECK-NEXT: Size: 4
-; CHECK-NEXT: Offset: 8
+; CHECK-NEXT: Offset: 14
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: IMPORT (0x2)
; CHECK-NEXT: Size: 81
-; CHECK-NEXT: Offset: 18
+; CHECK-NEXT: Offset: 24
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: FUNCTION (0x3)
; CHECK-NEXT: Size: 2
-; CHECK-NEXT: Offset: 105
+; CHECK-NEXT: Offset: 111
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: ELEM (0x9)
; CHECK-NEXT: Size: 7
-; CHECK-NEXT: Offset: 113
+; CHECK-NEXT: Offset: 119
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: DATACOUNT (0xC)
; CHECK-NEXT: Size: 1
-; CHECK-NEXT: Offset: 126
+; CHECK-NEXT: Offset: 132
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CODE (0xA)
; CHECK-NEXT: Size: 4
-; CHECK-NEXT: Offset: 133
+; CHECK-NEXT: Offset: 139
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: DATA (0xB)
; CHECK-NEXT: Size: 27
-; CHECK-NEXT: Offset: 143
+; CHECK-NEXT: Offset: 149
; CHECK-NEXT: Segments [
; CHECK-NEXT: Segment {
; CHECK-NEXT: Name: .data.foo
@@ -54,97 +54,97 @@
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 86
-; CHECK-NEXT: Offset: 176
+; CHECK-NEXT: Offset: 182
; CHECK-NEXT: Name: .debug_abbrev
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 130
-; CHECK-NEXT: Offset: 282
+; CHECK-NEXT: Offset: 288
; CHECK-NEXT: Name: .debug_info
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 80
-; CHECK-NEXT: Offset: 430
+; CHECK-NEXT: Offset: 436
; CHECK-NEXT: Name: .debug_aranges
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 121
-; CHECK-NEXT: Offset: 531
+; CHECK-NEXT: Offset: 537
; CHECK-NEXT: Name: .debug_str
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 42
-; CHECK-NEXT: Offset: 669
+; CHECK-NEXT: Offset: 675
; CHECK-NEXT: Name: .debug_pubnames
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 26
-; CHECK-NEXT: Offset: 733
+; CHECK-NEXT: Offset: 739
; CHECK-NEXT: Name: .debug_pubtypes
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 60
-; CHECK-NEXT: Offset: 781
+; CHECK-NEXT: Offset: 787
; CHECK-NEXT: Name: .debug_line
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 91
-; CHECK-NEXT: Offset: 859
+; CHECK-NEXT: Offset: 865
; CHECK-NEXT: Name: linking
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 9
-; CHECK-NEXT: Offset: 964
+; CHECK-NEXT: Offset: 970
; CHECK-NEXT: Name: reloc.DATA
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 61
-; CHECK-NEXT: Offset: 990
+; CHECK-NEXT: Offset: 996
; CHECK-NEXT: Name: reloc..debug_info
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 18
-; CHECK-NEXT: Offset: 1075
+; CHECK-NEXT: Offset: 1081
; CHECK-NEXT: Name: reloc..debug_aranges
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 6
-; CHECK-NEXT: Offset: 1120
+; CHECK-NEXT: Offset: 1126
; CHECK-NEXT: Name: reloc..debug_pubnames
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 6
-; CHECK-NEXT: Offset: 1154
+; CHECK-NEXT: Offset: 1160
; CHECK-NEXT: Name: reloc..debug_pubtypes
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 6
-; CHECK-NEXT: Offset: 1188
+; CHECK-NEXT: Offset: 1194
; CHECK-NEXT: Name: reloc..debug_line
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 77
-; CHECK-NEXT: Offset: 1218
+; CHECK-NEXT: Offset: 1224
; CHECK-NEXT: Name: producers
; CHECK-NEXT: }
; CHECK-NEXT: Section {
; CHECK-NEXT: Type: CUSTOM (0x0)
; CHECK-NEXT: Size: 11
-; CHECK-NEXT: Offset: 1311
+; CHECK-NEXT: Offset: 1317
; CHECK-NEXT: Name: target_features
; CHECK-NEXT: }
; CHECK-NEXT: ]
diff --git a/llvm/test/MC/WebAssembly/tag-section.ll b/llvm/test/MC/WebAssembly/tag-section.ll
index c40e132e36f0..56738ec80c8d 100644
--- a/llvm/test/MC/WebAssembly/tag-section.ll
+++ b/llvm/test/MC/WebAssembly/tag-section.ll
@@ -53,4 +53,4 @@ define i32 @test_throw1(ptr %p) {
; SEC: Type: TAG (0xD)
; SEC-NEXT: Size: 3
-; SEC-NEXT: Offset: 63
+; SEC-NEXT: Offset: 69
diff --git a/llvm/test/MC/X86/apx/adc-att.s b/llvm/test/MC/X86/apx/adc-att.s
new file mode 100644
index 000000000000..e4862161d1f3
--- /dev/null
+++ b/llvm/test/MC/X86/apx/adc-att.s
@@ -0,0 +1,161 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-52: error:
+# ERROR-NOT: error:
+# CHECK: {evex} adcb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xd3,0x7b]
+ {evex} adcb $123, %bl
+# CHECK: adcb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xd3,0x7b]
+ adcb $123, %bl, %cl
+# CHECK: {evex} adcw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xd2,0x7b]
+ {evex} adcw $123, %dx
+# CHECK: adcw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xd2,0x7b]
+ adcw $123, %dx, %ax
+# CHECK: {evex} adcl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xd1,0x7b]
+ {evex} adcl $123, %ecx
+# CHECK: adcl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xd1,0x7b]
+ adcl $123, %ecx, %edx
+# CHECK: {evex} adcq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xd1,0x7b]
+ {evex} adcq $123, %r9
+# CHECK: adcq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xd1,0x7b]
+ adcq $123, %r9, %r15
+# CHECK: {evex} adcb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adcb $123, 291(%r8,%rax,4)
+# CHECK: adcb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adcb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} adcw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adcw $123, 291(%r8,%rax,4)
+# CHECK: adcw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adcw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} adcl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adcl $123, 291(%r8,%rax,4)
+# CHECK: adcl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adcl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} adcq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adcq $123, 291(%r8,%rax,4)
+# CHECK: adcq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adcq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} adcw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xd2,0xd2,0x04]
+ {evex} adcw $1234, %dx
+# CHECK: adcw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xd2,0xd2,0x04]
+ adcw $1234, %dx, %ax
+# CHECK: {evex} adcw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} adcw $1234, 291(%r8,%rax,4)
+# CHECK: adcw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ adcw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {evex} adcl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ {evex} adcl $123456, %ecx
+# CHECK: adcl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ adcl $123456, %ecx, %edx
+# CHECK: {evex} adcq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ {evex} adcq $123456, %r9
+# CHECK: adcq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ adcq $123456, %r9, %r15
+# CHECK: {evex} adcl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} adcl $123456, 291(%r8,%rax,4)
+# CHECK: adcl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ adcl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} adcq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} adcq $123456, 291(%r8,%rax,4)
+# CHECK: adcq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ adcq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {evex} adcb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x10,0xd9]
+ {evex} adcb %bl, %cl
+# CHECK: adcb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x10,0xd9]
+ adcb %bl, %cl, %r8b
+# CHECK: {evex} adcb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x10,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcb %bl, 291(%r8,%rax,4)
+# CHECK: adcb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x10,0x9c,0x80,0x23,0x01,0x00,0x00]
+ adcb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {evex} adcw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x11,0xd0]
+ {evex} adcw %dx, %ax
+# CHECK: adcw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x11,0xd0]
+ adcw %dx, %ax, %r9w
+# CHECK: {evex} adcw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x11,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcw %dx, 291(%r8,%rax,4)
+# CHECK: adcw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x11,0x94,0x80,0x23,0x01,0x00,0x00]
+ adcw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {evex} adcl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x11,0xca]
+ {evex} adcl %ecx, %edx
+# CHECK: adcl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x11,0xca]
+ adcl %ecx, %edx, %r10d
+# CHECK: {evex} adcl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcl %ecx, 291(%r8,%rax,4)
+# CHECK: adcl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adcl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {evex} adcq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x11,0xcf]
+ {evex} adcq %r9, %r15
+# CHECK: adcq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x11,0xcf]
+ adcq %r9, %r15, %r11
+# CHECK: {evex} adcq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcq %r9, 291(%r8,%rax,4)
+# CHECK: adcq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adcq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {evex} adcb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x12,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcb 291(%r8,%rax,4), %bl
+# CHECK: adcb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x12,0x9c,0x80,0x23,0x01,0x00,0x00]
+ adcb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {evex} adcw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x13,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcw 291(%r8,%rax,4), %dx
+# CHECK: adcw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x13,0x94,0x80,0x23,0x01,0x00,0x00]
+ adcw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {evex} adcl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcl 291(%r8,%rax,4), %ecx
+# CHECK: adcl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adcl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {evex} adcq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adcq 291(%r8,%rax,4), %r9
+# CHECK: adcq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adcq 291(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/adc-intel.s b/llvm/test/MC/X86/apx/adc-intel.s
new file mode 100644
index 000000000000..740b2463bad7
--- /dev/null
+++ b/llvm/test/MC/X86/apx/adc-intel.s
@@ -0,0 +1,158 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} adc bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xd3,0x7b]
+ {evex} adc bl, 123
+# CHECK: adc cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xd3,0x7b]
+ adc cl, bl, 123
+# CHECK: {evex} adc dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xd2,0x7b]
+ {evex} adc dx, 123
+# CHECK: adc ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xd2,0x7b]
+ adc ax, dx, 123
+# CHECK: {evex} adc ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xd1,0x7b]
+ {evex} adc ecx, 123
+# CHECK: adc edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xd1,0x7b]
+ adc edx, ecx, 123
+# CHECK: {evex} adc r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xd1,0x7b]
+ {evex} adc r9, 123
+# CHECK: adc r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xd1,0x7b]
+ adc r15, r9, 123
+# CHECK: {evex} adc byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adc byte ptr [r8 + 4*rax + 291], 123
+# CHECK: adc bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adc bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} adc word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adc word ptr [r8 + 4*rax + 291], 123
+# CHECK: adc dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adc dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} adc dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adc dword ptr [r8 + 4*rax + 291], 123
+# CHECK: adc ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adc ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} adc qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} adc qword ptr [r8 + 4*rax + 291], 123
+# CHECK: adc r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ adc r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} adc dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xd2,0xd2,0x04]
+ {evex} adc dx, 1234
+# CHECK: adc ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xd2,0xd2,0x04]
+ adc ax, dx, 1234
+# CHECK: {evex} adc word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} adc word ptr [r8 + 4*rax + 291], 1234
+# CHECK: adc dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ adc dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {evex} adc ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ {evex} adc ecx, 123456
+# CHECK: adc edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ adc edx, ecx, 123456
+# CHECK: {evex} adc r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ {evex} adc r9, 123456
+# CHECK: adc r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xd1,0x40,0xe2,0x01,0x00]
+ adc r15, r9, 123456
+# CHECK: {evex} adc dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} adc dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: adc ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ adc ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} adc qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} adc qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: adc r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x94,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ adc r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} adc cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x10,0xd9]
+ {evex} adc cl, bl
+# CHECK: adc r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x10,0xd9]
+ adc r8b, cl, bl
+# CHECK: {evex} adc byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x10,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc byte ptr [r8 + 4*rax + 291], bl
+# CHECK: adc cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x10,0x9c,0x80,0x23,0x01,0x00,0x00]
+ adc cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {evex} adc ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x11,0xd0]
+ {evex} adc ax, dx
+# CHECK: adc r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x11,0xd0]
+ adc r9w, ax, dx
+# CHECK: {evex} adc word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x11,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc word ptr [r8 + 4*rax + 291], dx
+# CHECK: adc ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x11,0x94,0x80,0x23,0x01,0x00,0x00]
+ adc ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {evex} adc edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x11,0xca]
+ {evex} adc edx, ecx
+# CHECK: adc r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x11,0xca]
+ adc r10d, edx, ecx
+# CHECK: {evex} adc dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: adc edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adc edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {evex} adc r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x11,0xcf]
+ {evex} adc r15, r9
+# CHECK: adc r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x11,0xcf]
+ adc r11, r15, r9
+# CHECK: {evex} adc qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc qword ptr [r8 + 4*rax + 291], r9
+# CHECK: adc r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x11,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adc r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {evex} adc bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x12,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: adc cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x12,0x9c,0x80,0x23,0x01,0x00,0x00]
+ adc cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} adc dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x13,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc dx, word ptr [r8 + 4*rax + 291]
+# CHECK: adc ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x13,0x94,0x80,0x23,0x01,0x00,0x00]
+ adc ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} adc ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: adc edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adc edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} adc r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} adc r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: adc r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x13,0x8c,0x80,0x23,0x01,0x00,0x00]
+ adc r15, r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/add-att.s b/llvm/test/MC/X86/apx/add-att.s
new file mode 100644
index 000000000000..97c0b036c433
--- /dev/null
+++ b/llvm/test/MC/X86/apx/add-att.s
@@ -0,0 +1,317 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-104: error:
+# ERROR-NOT: error:
+# CHECK: {evex} addb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xc3,0x7b]
+ {evex} addb $123, %bl
+# CHECK: {nf} addb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xc3,0x7b]
+ {nf} addb $123, %bl
+# CHECK: addb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xc3,0x7b]
+ addb $123, %bl, %cl
+# CHECK: {nf} addb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xc3,0x7b]
+ {nf} addb $123, %bl, %cl
+# CHECK: {evex} addw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xc2,0x7b]
+ {evex} addw $123, %dx
+# CHECK: {nf} addw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xc2,0x7b]
+ {nf} addw $123, %dx
+# CHECK: addw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xc2,0x7b]
+ addw $123, %dx, %ax
+# CHECK: {nf} addw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xc2,0x7b]
+ {nf} addw $123, %dx, %ax
+# CHECK: {evex} addl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xc1,0x7b]
+ {evex} addl $123, %ecx
+# CHECK: {nf} addl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xc1,0x7b]
+ {nf} addl $123, %ecx
+# CHECK: addl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xc1,0x7b]
+ addl $123, %ecx, %edx
+# CHECK: {nf} addl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xc1,0x7b]
+ {nf} addl $123, %ecx, %edx
+# CHECK: {evex} addq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xc1,0x7b]
+ {evex} addq $123, %r9
+# CHECK: {nf} addq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xc1,0x7b]
+ {nf} addq $123, %r9
+# CHECK: addq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xc1,0x7b]
+ addq $123, %r9, %r15
+# CHECK: {nf} addq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xc1,0x7b]
+ {nf} addq $123, %r9, %r15
+# CHECK: {evex} addb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} addb $123, 291(%r8,%rax,4)
+# CHECK: {nf} addb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addb $123, 291(%r8,%rax,4)
+# CHECK: addb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ addb $123, 291(%r8,%rax,4), %bl
+# CHECK: {nf} addb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} addw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} addw $123, 291(%r8,%rax,4)
+# CHECK: {nf} addw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addw $123, 291(%r8,%rax,4)
+# CHECK: addw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ addw $123, 291(%r8,%rax,4), %dx
+# CHECK: {nf} addw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} addl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} addl $123, 291(%r8,%rax,4)
+# CHECK: {nf} addl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addl $123, 291(%r8,%rax,4)
+# CHECK: addl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ addl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} addl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} addq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} addq $123, 291(%r8,%rax,4)
+# CHECK: {nf} addq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addq $123, 291(%r8,%rax,4)
+# CHECK: addq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ addq $123, 291(%r8,%rax,4), %r9
+# CHECK: {nf} addq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} addq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} addw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xc2,0xd2,0x04]
+ {evex} addw $1234, %dx
+# CHECK: {nf} addw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xc2,0xd2,0x04]
+ {nf} addw $1234, %dx
+# CHECK: addw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xc2,0xd2,0x04]
+ addw $1234, %dx, %ax
+# CHECK: {nf} addw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xc2,0xd2,0x04]
+ {nf} addw $1234, %dx, %ax
+# CHECK: {evex} addw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} addw $1234, 291(%r8,%rax,4)
+# CHECK: {nf} addw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} addw $1234, 291(%r8,%rax,4)
+# CHECK: addw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ addw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {nf} addw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} addw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {evex} addl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} addl $123456, %ecx
+# CHECK: {nf} addl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} addl $123456, %ecx
+# CHECK: addl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ addl $123456, %ecx, %edx
+# CHECK: {nf} addl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} addl $123456, %ecx, %edx
+# CHECK: {evex} addq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} addq $123456, %r9
+# CHECK: {nf} addq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} addq $123456, %r9
+# CHECK: addq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ addq $123456, %r9, %r15
+# CHECK: {nf} addq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} addq $123456, %r9, %r15
+# CHECK: {evex} addl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} addl $123456, 291(%r8,%rax,4)
+# CHECK: {nf} addl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} addl $123456, 291(%r8,%rax,4)
+# CHECK: addl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ addl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} addl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} addl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} addq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} addq $123456, 291(%r8,%rax,4)
+# CHECK: {nf} addq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} addq $123456, 291(%r8,%rax,4)
+# CHECK: addq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ addq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {nf} addq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} addq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {evex} addb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x00,0xd9]
+ {evex} addb %bl, %cl
+# CHECK: {nf} addb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x00,0xd9]
+ {nf} addb %bl, %cl
+# CHECK: addb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x00,0xd9]
+ addb %bl, %cl, %r8b
+# CHECK: {nf} addb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x00,0xd9]
+ {nf} addb %bl, %cl, %r8b
+# CHECK: {evex} addb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} addb %bl, 291(%r8,%rax,4)
+# CHECK: {nf} addb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addb %bl, 291(%r8,%rax,4)
+# CHECK: addb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ addb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {nf} addb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {evex} addw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x01,0xd0]
+ {evex} addw %dx, %ax
+# CHECK: {nf} addw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x01,0xd0]
+ {nf} addw %dx, %ax
+# CHECK: addw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x01,0xd0]
+ addw %dx, %ax, %r9w
+# CHECK: {nf} addw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x01,0xd0]
+ {nf} addw %dx, %ax, %r9w
+# CHECK: {evex} addw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} addw %dx, 291(%r8,%rax,4)
+# CHECK: {nf} addw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} addw %dx, 291(%r8,%rax,4)
+# CHECK: addw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ addw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {nf} addw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} addw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {evex} addl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x01,0xca]
+ {evex} addl %ecx, %edx
+# CHECK: {nf} addl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x01,0xca]
+ {nf} addl %ecx, %edx
+# CHECK: addl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x01,0xca]
+ addl %ecx, %edx, %r10d
+# CHECK: {nf} addl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x01,0xca]
+ {nf} addl %ecx, %edx, %r10d
+# CHECK: {evex} addl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} addl %ecx, 291(%r8,%rax,4)
+# CHECK: {nf} addl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addl %ecx, 291(%r8,%rax,4)
+# CHECK: addl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ addl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {nf} addl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {evex} addq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x01,0xcf]
+ {evex} addq %r9, %r15
+# CHECK: {nf} addq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x01,0xcf]
+ {nf} addq %r9, %r15
+# CHECK: addq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x01,0xcf]
+ addq %r9, %r15, %r11
+# CHECK: {nf} addq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x01,0xcf]
+ {nf} addq %r9, %r15, %r11
+# CHECK: {evex} addq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} addq %r9, 291(%r8,%rax,4)
+# CHECK: {nf} addq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addq %r9, 291(%r8,%rax,4)
+# CHECK: addq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ addq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {nf} addq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {evex} addb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} addb 291(%r8,%rax,4), %bl
+# CHECK: {nf} addb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addb 291(%r8,%rax,4), %bl
+# CHECK: addb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ addb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {nf} addb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {evex} addw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} addw 291(%r8,%rax,4), %dx
+# CHECK: {nf} addw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} addw 291(%r8,%rax,4), %dx
+# CHECK: addw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ addw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {nf} addw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} addw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {evex} addl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} addl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} addl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addl 291(%r8,%rax,4), %ecx
+# CHECK: addl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ addl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {nf} addl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {evex} addq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} addq 291(%r8,%rax,4), %r9
+# CHECK: {nf} addq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addq 291(%r8,%rax,4), %r9
+# CHECK: addq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ addq 291(%r8,%rax,4), %r9, %r15
+# CHECK: {nf} addq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} addq 291(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/add-intel.s b/llvm/test/MC/X86/apx/add-intel.s
new file mode 100644
index 000000000000..5f0a931e2ff4
--- /dev/null
+++ b/llvm/test/MC/X86/apx/add-intel.s
@@ -0,0 +1,314 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} add bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xc3,0x7b]
+ {evex} add bl, 123
+# CHECK: {nf} add bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xc3,0x7b]
+ {nf} add bl, 123
+# CHECK: add cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xc3,0x7b]
+ add cl, bl, 123
+# CHECK: {nf} add cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xc3,0x7b]
+ {nf} add cl, bl, 123
+# CHECK: {evex} add dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xc2,0x7b]
+ {evex} add dx, 123
+# CHECK: {nf} add dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xc2,0x7b]
+ {nf} add dx, 123
+# CHECK: add ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xc2,0x7b]
+ add ax, dx, 123
+# CHECK: {nf} add ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xc2,0x7b]
+ {nf} add ax, dx, 123
+# CHECK: {evex} add ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xc1,0x7b]
+ {evex} add ecx, 123
+# CHECK: {nf} add ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xc1,0x7b]
+ {nf} add ecx, 123
+# CHECK: add edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xc1,0x7b]
+ add edx, ecx, 123
+# CHECK: {nf} add edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xc1,0x7b]
+ {nf} add edx, ecx, 123
+# CHECK: {evex} add r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xc1,0x7b]
+ {evex} add r9, 123
+# CHECK: {nf} add r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xc1,0x7b]
+ {nf} add r9, 123
+# CHECK: add r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xc1,0x7b]
+ add r15, r9, 123
+# CHECK: {nf} add r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xc1,0x7b]
+ {nf} add r15, r9, 123
+# CHECK: {evex} add byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} add byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add byte ptr [r8 + 4*rax + 291], 123
+# CHECK: add bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ add bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} add word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} add word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add word ptr [r8 + 4*rax + 291], 123
+# CHECK: add dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ add dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} add dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} add dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add dword ptr [r8 + 4*rax + 291], 123
+# CHECK: add ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ add ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} add qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} add qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add qword ptr [r8 + 4*rax + 291], 123
+# CHECK: add r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ add r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} add r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0x84,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} add r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} add dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xc2,0xd2,0x04]
+ {evex} add dx, 1234
+# CHECK: {nf} add dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xc2,0xd2,0x04]
+ {nf} add dx, 1234
+# CHECK: add ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xc2,0xd2,0x04]
+ add ax, dx, 1234
+# CHECK: {nf} add ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xc2,0xd2,0x04]
+ {nf} add ax, dx, 1234
+# CHECK: {evex} add word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} add word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} add word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} add word ptr [r8 + 4*rax + 291], 1234
+# CHECK: add dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ add dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} add dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} add dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {evex} add ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} add ecx, 123456
+# CHECK: {nf} add ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} add ecx, 123456
+# CHECK: add edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ add edx, ecx, 123456
+# CHECK: {nf} add edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} add edx, ecx, 123456
+# CHECK: {evex} add r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} add r9, 123456
+# CHECK: {nf} add r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} add r9, 123456
+# CHECK: add r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ add r15, r9, 123456
+# CHECK: {nf} add r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xc1,0x40,0xe2,0x01,0x00]
+ {nf} add r15, r9, 123456
+# CHECK: {evex} add dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} add dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} add dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} add dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: add ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ add ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} add ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} add ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} add qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} add qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} add qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} add qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: add r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ add r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} add r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0x84,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} add r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} add cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x00,0xd9]
+ {evex} add cl, bl
+# CHECK: {nf} add cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x00,0xd9]
+ {nf} add cl, bl
+# CHECK: add r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x00,0xd9]
+ add r8b, cl, bl
+# CHECK: {nf} add r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x00,0xd9]
+ {nf} add r8b, cl, bl
+# CHECK: {evex} add byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} add byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} add byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add byte ptr [r8 + 4*rax + 291], bl
+# CHECK: add cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ add cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} add cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x00,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {evex} add ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x01,0xd0]
+ {evex} add ax, dx
+# CHECK: {nf} add ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x01,0xd0]
+ {nf} add ax, dx
+# CHECK: add r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x01,0xd0]
+ add r9w, ax, dx
+# CHECK: {nf} add r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x01,0xd0]
+ {nf} add r9w, ax, dx
+# CHECK: {evex} add word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} add word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} add word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} add word ptr [r8 + 4*rax + 291], dx
+# CHECK: add ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ add ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} add ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x01,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} add ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {evex} add edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x01,0xca]
+ {evex} add edx, ecx
+# CHECK: {nf} add edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x01,0xca]
+ {nf} add edx, ecx
+# CHECK: add r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x01,0xca]
+ add r10d, edx, ecx
+# CHECK: {nf} add r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x01,0xca]
+ {nf} add r10d, edx, ecx
+# CHECK: {evex} add dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} add dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} add dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: add edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ add edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} add edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {evex} add r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x01,0xcf]
+ {evex} add r15, r9
+# CHECK: {nf} add r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x01,0xcf]
+ {nf} add r15, r9
+# CHECK: add r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x01,0xcf]
+ add r11, r15, r9
+# CHECK: {nf} add r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x01,0xcf]
+ {nf} add r11, r15, r9
+# CHECK: {evex} add qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} add qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} add qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add qword ptr [r8 + 4*rax + 291], r9
+# CHECK: add r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ add r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} add r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x01,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {evex} add bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} add bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: add cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ add cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x02,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} add dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} add dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} add dx, word ptr [r8 + 4*rax + 291]
+# CHECK: add ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ add ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x03,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} add ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} add ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} add ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: add edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ add edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} add r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} add r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: add r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ add r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} add r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x03,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} add r15, r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/amx-tile-att.s b/llvm/test/MC/X86/apx/amx-tile-att.s
new file mode 100644
index 000000000000..f4a47c16d193
--- /dev/null
+++ b/llvm/test/MC/X86/apx/amx-tile-att.s
@@ -0,0 +1,24 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-5: error:
+# ERROR-NOT: error:
+# CHECK: ldtilecfg 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00]
+ ldtilecfg 291(%r28,%r29,4)
+
+# CHECK: sttilecfg 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00]
+ sttilecfg 291(%r28,%r29,4)
+
+# CHECK: tileloadd 291(%r28,%r29,4), %tmm6
+# CHECK: encoding: [0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00]
+ tileloadd 291(%r28,%r29,4), %tmm6
+
+# CHECK: tileloaddt1 291(%r28,%r29,4), %tmm6
+# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00]
+ tileloaddt1 291(%r28,%r29,4), %tmm6
+
+# CHECK: tilestored %tmm6, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00]
+ tilestored %tmm6, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/amx-tile-intel.s b/llvm/test/MC/X86/apx/amx-tile-intel.s
new file mode 100644
index 000000000000..dd7b87b1806c
--- /dev/null
+++ b/llvm/test/MC/X86/apx/amx-tile-intel.s
@@ -0,0 +1,21 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: ldtilecfg [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00]
+ ldtilecfg [r28 + 4*r29 + 291]
+
+# CHECK: sttilecfg [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00]
+ sttilecfg [r28 + 4*r29 + 291]
+
+# CHECK: tileloadd tmm6, [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00]
+ tileloadd tmm6, [r28 + 4*r29 + 291]
+
+# CHECK: tileloaddt1 tmm6, [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00]
+ tileloaddt1 tmm6, [r28 + 4*r29 + 291]
+
+# CHECK: tilestored [r28 + 4*r29 + 291], tmm6
+# CHECK: encoding: [0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00]
+ tilestored [r28 + 4*r29 + 291], tmm6
diff --git a/llvm/test/MC/X86/apx/and-att.s b/llvm/test/MC/X86/apx/and-att.s
new file mode 100644
index 000000000000..7b4953756e3b
--- /dev/null
+++ b/llvm/test/MC/X86/apx/and-att.s
@@ -0,0 +1,317 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-104: error:
+# ERROR-NOT: error:
+# CHECK: {evex} andb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xe3,0x7b]
+ {evex} andb $123, %bl
+# CHECK: {nf} andb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xe3,0x7b]
+ {nf} andb $123, %bl
+# CHECK: andb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xe3,0x7b]
+ andb $123, %bl, %cl
+# CHECK: {nf} andb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xe3,0x7b]
+ {nf} andb $123, %bl, %cl
+# CHECK: {evex} andw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xe2,0x7b]
+ {evex} andw $123, %dx
+# CHECK: {nf} andw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xe2,0x7b]
+ {nf} andw $123, %dx
+# CHECK: andw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xe2,0x7b]
+ andw $123, %dx, %ax
+# CHECK: {nf} andw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xe2,0x7b]
+ {nf} andw $123, %dx, %ax
+# CHECK: {evex} andl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xe1,0x7b]
+ {evex} andl $123, %ecx
+# CHECK: {nf} andl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xe1,0x7b]
+ {nf} andl $123, %ecx
+# CHECK: andl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xe1,0x7b]
+ andl $123, %ecx, %edx
+# CHECK: {nf} andl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xe1,0x7b]
+ {nf} andl $123, %ecx, %edx
+# CHECK: {evex} andq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xe1,0x7b]
+ {evex} andq $123, %r9
+# CHECK: {nf} andq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xe1,0x7b]
+ {nf} andq $123, %r9
+# CHECK: andq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xe1,0x7b]
+ andq $123, %r9, %r15
+# CHECK: {nf} andq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xe1,0x7b]
+ {nf} andq $123, %r9, %r15
+# CHECK: {evex} andb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} andb $123, 291(%r8,%rax,4)
+# CHECK: {nf} andb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andb $123, 291(%r8,%rax,4)
+# CHECK: andb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ andb $123, 291(%r8,%rax,4), %bl
+# CHECK: {nf} andb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} andw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} andw $123, 291(%r8,%rax,4)
+# CHECK: {nf} andw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andw $123, 291(%r8,%rax,4)
+# CHECK: andw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ andw $123, 291(%r8,%rax,4), %dx
+# CHECK: {nf} andw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} andl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} andl $123, 291(%r8,%rax,4)
+# CHECK: {nf} andl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andl $123, 291(%r8,%rax,4)
+# CHECK: andl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ andl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} andl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} andq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} andq $123, 291(%r8,%rax,4)
+# CHECK: {nf} andq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andq $123, 291(%r8,%rax,4)
+# CHECK: andq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ andq $123, 291(%r8,%rax,4), %r9
+# CHECK: {nf} andq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} andq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} andw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xe2,0xd2,0x04]
+ {evex} andw $1234, %dx
+# CHECK: {nf} andw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xe2,0xd2,0x04]
+ {nf} andw $1234, %dx
+# CHECK: andw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xe2,0xd2,0x04]
+ andw $1234, %dx, %ax
+# CHECK: {nf} andw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xe2,0xd2,0x04]
+ {nf} andw $1234, %dx, %ax
+# CHECK: {evex} andw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} andw $1234, 291(%r8,%rax,4)
+# CHECK: {nf} andw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} andw $1234, 291(%r8,%rax,4)
+# CHECK: andw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ andw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {nf} andw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} andw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {evex} andl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {evex} andl $123456, %ecx
+# CHECK: {nf} andl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} andl $123456, %ecx
+# CHECK: andl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ andl $123456, %ecx, %edx
+# CHECK: {nf} andl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} andl $123456, %ecx, %edx
+# CHECK: {evex} andq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {evex} andq $123456, %r9
+# CHECK: {nf} andq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} andq $123456, %r9
+# CHECK: andq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ andq $123456, %r9, %r15
+# CHECK: {nf} andq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} andq $123456, %r9, %r15
+# CHECK: {evex} andl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} andl $123456, 291(%r8,%rax,4)
+# CHECK: {nf} andl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} andl $123456, 291(%r8,%rax,4)
+# CHECK: andl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ andl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} andl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} andl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} andq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} andq $123456, 291(%r8,%rax,4)
+# CHECK: {nf} andq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} andq $123456, 291(%r8,%rax,4)
+# CHECK: andq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ andq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {nf} andq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} andq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {evex} andb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x20,0xd9]
+ {evex} andb %bl, %cl
+# CHECK: {nf} andb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x20,0xd9]
+ {nf} andb %bl, %cl
+# CHECK: andb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x20,0xd9]
+ andb %bl, %cl, %r8b
+# CHECK: {nf} andb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x20,0xd9]
+ {nf} andb %bl, %cl, %r8b
+# CHECK: {evex} andb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} andb %bl, 291(%r8,%rax,4)
+# CHECK: {nf} andb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andb %bl, 291(%r8,%rax,4)
+# CHECK: andb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ andb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {nf} andb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {evex} andw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x21,0xd0]
+ {evex} andw %dx, %ax
+# CHECK: {nf} andw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x21,0xd0]
+ {nf} andw %dx, %ax
+# CHECK: andw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x21,0xd0]
+ andw %dx, %ax, %r9w
+# CHECK: {nf} andw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x21,0xd0]
+ {nf} andw %dx, %ax, %r9w
+# CHECK: {evex} andw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} andw %dx, 291(%r8,%rax,4)
+# CHECK: {nf} andw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} andw %dx, 291(%r8,%rax,4)
+# CHECK: andw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ andw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {nf} andw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} andw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {evex} andl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x21,0xca]
+ {evex} andl %ecx, %edx
+# CHECK: {nf} andl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x21,0xca]
+ {nf} andl %ecx, %edx
+# CHECK: andl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x21,0xca]
+ andl %ecx, %edx, %r10d
+# CHECK: {nf} andl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x21,0xca]
+ {nf} andl %ecx, %edx, %r10d
+# CHECK: {evex} andl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} andl %ecx, 291(%r8,%rax,4)
+# CHECK: {nf} andl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andl %ecx, 291(%r8,%rax,4)
+# CHECK: andl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ andl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {nf} andl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {evex} andq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x21,0xcf]
+ {evex} andq %r9, %r15
+# CHECK: {nf} andq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x21,0xcf]
+ {nf} andq %r9, %r15
+# CHECK: andq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x21,0xcf]
+ andq %r9, %r15, %r11
+# CHECK: {nf} andq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x21,0xcf]
+ {nf} andq %r9, %r15, %r11
+# CHECK: {evex} andq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} andq %r9, 291(%r8,%rax,4)
+# CHECK: {nf} andq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andq %r9, 291(%r8,%rax,4)
+# CHECK: andq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ andq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {nf} andq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {evex} andb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} andb 291(%r8,%rax,4), %bl
+# CHECK: {nf} andb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andb 291(%r8,%rax,4), %bl
+# CHECK: andb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ andb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {nf} andb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {evex} andw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} andw 291(%r8,%rax,4), %dx
+# CHECK: {nf} andw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} andw 291(%r8,%rax,4), %dx
+# CHECK: andw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ andw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {nf} andw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} andw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {evex} andl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} andl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} andl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andl 291(%r8,%rax,4), %ecx
+# CHECK: andl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ andl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {nf} andl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {evex} andq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} andq 291(%r8,%rax,4), %r9
+# CHECK: {nf} andq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andq 291(%r8,%rax,4), %r9
+# CHECK: andq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ andq 291(%r8,%rax,4), %r9, %r15
+# CHECK: {nf} andq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} andq 291(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/and-intel.s b/llvm/test/MC/X86/apx/and-intel.s
new file mode 100644
index 000000000000..fa4a0fbb3006
--- /dev/null
+++ b/llvm/test/MC/X86/apx/and-intel.s
@@ -0,0 +1,314 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} and bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xe3,0x7b]
+ {evex} and bl, 123
+# CHECK: {nf} and bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xe3,0x7b]
+ {nf} and bl, 123
+# CHECK: and cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xe3,0x7b]
+ and cl, bl, 123
+# CHECK: {nf} and cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xe3,0x7b]
+ {nf} and cl, bl, 123
+# CHECK: {evex} and dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xe2,0x7b]
+ {evex} and dx, 123
+# CHECK: {nf} and dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xe2,0x7b]
+ {nf} and dx, 123
+# CHECK: and ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xe2,0x7b]
+ and ax, dx, 123
+# CHECK: {nf} and ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xe2,0x7b]
+ {nf} and ax, dx, 123
+# CHECK: {evex} and ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xe1,0x7b]
+ {evex} and ecx, 123
+# CHECK: {nf} and ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xe1,0x7b]
+ {nf} and ecx, 123
+# CHECK: and edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xe1,0x7b]
+ and edx, ecx, 123
+# CHECK: {nf} and edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xe1,0x7b]
+ {nf} and edx, ecx, 123
+# CHECK: {evex} and r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xe1,0x7b]
+ {evex} and r9, 123
+# CHECK: {nf} and r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xe1,0x7b]
+ {nf} and r9, 123
+# CHECK: and r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xe1,0x7b]
+ and r15, r9, 123
+# CHECK: {nf} and r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xe1,0x7b]
+ {nf} and r15, r9, 123
+# CHECK: {evex} and byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} and byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and byte ptr [r8 + 4*rax + 291], 123
+# CHECK: and bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ and bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} and word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} and word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and word ptr [r8 + 4*rax + 291], 123
+# CHECK: and dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ and dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} and dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} and dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and dword ptr [r8 + 4*rax + 291], 123
+# CHECK: and ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ and ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} and qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} and qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and qword ptr [r8 + 4*rax + 291], 123
+# CHECK: and r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ and r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} and r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} and r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} and dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xe2,0xd2,0x04]
+ {evex} and dx, 1234
+# CHECK: {nf} and dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xe2,0xd2,0x04]
+ {nf} and dx, 1234
+# CHECK: and ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xe2,0xd2,0x04]
+ and ax, dx, 1234
+# CHECK: {nf} and ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xe2,0xd2,0x04]
+ {nf} and ax, dx, 1234
+# CHECK: {evex} and word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} and word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} and word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} and word ptr [r8 + 4*rax + 291], 1234
+# CHECK: and dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ and dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} and dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} and dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {evex} and ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {evex} and ecx, 123456
+# CHECK: {nf} and ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} and ecx, 123456
+# CHECK: and edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ and edx, ecx, 123456
+# CHECK: {nf} and edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} and edx, ecx, 123456
+# CHECK: {evex} and r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {evex} and r9, 123456
+# CHECK: {nf} and r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} and r9, 123456
+# CHECK: and r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ and r15, r9, 123456
+# CHECK: {nf} and r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xe1,0x40,0xe2,0x01,0x00]
+ {nf} and r15, r9, 123456
+# CHECK: {evex} and dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} and dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} and dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} and dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: and ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ and ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} and ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} and ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} and qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} and qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} and qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} and qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: and r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ and r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} and r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0xa4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} and r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} and cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x20,0xd9]
+ {evex} and cl, bl
+# CHECK: {nf} and cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x20,0xd9]
+ {nf} and cl, bl
+# CHECK: and r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x20,0xd9]
+ and r8b, cl, bl
+# CHECK: {nf} and r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x20,0xd9]
+ {nf} and r8b, cl, bl
+# CHECK: {evex} and byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} and byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} and byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and byte ptr [r8 + 4*rax + 291], bl
+# CHECK: and cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ and cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} and cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x20,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {evex} and ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x21,0xd0]
+ {evex} and ax, dx
+# CHECK: {nf} and ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x21,0xd0]
+ {nf} and ax, dx
+# CHECK: and r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x21,0xd0]
+ and r9w, ax, dx
+# CHECK: {nf} and r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x21,0xd0]
+ {nf} and r9w, ax, dx
+# CHECK: {evex} and word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} and word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} and word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} and word ptr [r8 + 4*rax + 291], dx
+# CHECK: and ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ and ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} and ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x21,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} and ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {evex} and edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x21,0xca]
+ {evex} and edx, ecx
+# CHECK: {nf} and edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x21,0xca]
+ {nf} and edx, ecx
+# CHECK: and r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x21,0xca]
+ and r10d, edx, ecx
+# CHECK: {nf} and r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x21,0xca]
+ {nf} and r10d, edx, ecx
+# CHECK: {evex} and dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} and dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} and dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: and edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ and edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} and edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {evex} and r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x21,0xcf]
+ {evex} and r15, r9
+# CHECK: {nf} and r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x21,0xcf]
+ {nf} and r15, r9
+# CHECK: and r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x21,0xcf]
+ and r11, r15, r9
+# CHECK: {nf} and r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x21,0xcf]
+ {nf} and r11, r15, r9
+# CHECK: {evex} and qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} and qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} and qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and qword ptr [r8 + 4*rax + 291], r9
+# CHECK: and r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ and r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} and r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x21,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {evex} and bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} and bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: and cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ and cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x22,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} and dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} and dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} and dx, word ptr [r8 + 4*rax + 291]
+# CHECK: and ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ and ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x23,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} and ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} and ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} and ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: and edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ and edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} and r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} and r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: and r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ and r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} and r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x23,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} and r15, r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/cmpccxadd-att.s b/llvm/test/MC/X86/apx/cmpccxadd-att.s
new file mode 100644
index 000000000000..ce23588a1849
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cmpccxadd-att.s
@@ -0,0 +1,124 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-30: error:
+# ERROR-NOT: error:
+# CHECK: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpbexadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpbexadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpbexadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpbxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpbxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpbxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpbxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmplexadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmplexadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmplexadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmplexadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmplxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmplxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmplxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmplxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpaxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpaxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpaxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpgxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpgxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpgxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpgexadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpgexadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpgexadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpnoxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnoxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpnpxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnpxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpnsxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnsxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnexadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpnexadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnexadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpoxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpoxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpoxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpoxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmppxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmppxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmppxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmppxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpsxadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpsxadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpsxadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpsxadd %r19, %r23, 291(%r28,%r29,4)
+
+# CHECK: cmpexadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpexadd %r18d, %r22d, 291(%r28,%r29,4)
+
+# CHECK: cmpexadd %r19, %r23, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpexadd %r19, %r23, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/cmpccxadd-intel.s b/llvm/test/MC/X86/apx/cmpccxadd-intel.s
new file mode 100644
index 000000000000..c2630d3d9273
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cmpccxadd-intel.s
@@ -0,0 +1,121 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmppxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmppxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19
+
+# CHECK: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
+# CHECK: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19
+# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00]
+ cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19
diff --git a/llvm/test/MC/X86/apx/crc32-att.s b/llvm/test/MC/X86/apx/crc32-att.s
new file mode 100644
index 000000000000..16f1f052dbf9
--- /dev/null
+++ b/llvm/test/MC/X86/apx/crc32-att.s
@@ -0,0 +1,92 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-22: error:
+# ERROR-NOT: error:
+# CHECK: {evex} crc32b %al, %ebx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf0,0xd8]
+ {evex} crc32b %al, %ebx
+
+# CHECK: {evex} crc32b %al, %rbx
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf0,0xd8]
+ {evex} crc32b %al, %rbx
+
+# CHECK: {evex} crc32w %ax, %ebx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xd8]
+ {evex} crc32w %ax, %ebx
+
+# CHECK: {evex} crc32l %eax, %ebx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf1,0xd8]
+ {evex} crc32l %eax, %ebx
+
+# CHECK: {evex} crc32q %rax, %rbx
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf1,0xd8]
+ {evex} crc32q %rax, %rbx
+
+# CHECK: {evex} crc32w 291(%rax,%rbx,4), %ecx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32w 291(%rax,%rbx,4), %ecx
+
+# CHECK: {evex} crc32l 291(%rax,%rbx,4), %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32l 291(%rax,%rbx,4), %ecx
+
+# CHECK: {evex} crc32b 291(%rax,%rbx,4), %rcx
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf0,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32b 291(%rax,%rbx,4), %rcx
+
+# CHECK: {evex} crc32q 291(%rax,%rbx,4), %rcx
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32q 291(%rax,%rbx,4), %rcx
+
+# CHECK: crc32b %r16b, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x08,0xf0,0xf0]
+ crc32b %r16b, %r22d
+
+# CHECK: crc32b %r16b, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x08,0xf0,0xf8]
+ crc32b %r16b, %r23
+
+# CHECK: crc32w %r17w, %r22d
+# CHECK: encoding: [0x62,0xec,0x7d,0x08,0xf1,0xf1]
+ crc32w %r17w, %r22d
+
+# CHECK: crc32l %r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x08,0xf1,0xf2]
+ crc32l %r18d, %r22d
+
+# CHECK: crc32q %r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x08,0xf1,0xfb]
+ crc32q %r19, %r23
+
+# CHECK: crc32w 291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0xf1,0x94,0xac,0x23,0x01,0x00,0x00]
+ crc32w 291(%r28,%r29,4), %r18d
+
+# CHECK: crc32l 291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0xf1,0x94,0xac,0x23,0x01,0x00,0x00]
+ crc32l 291(%r28,%r29,4), %r18d
+
+# CHECK: crc32b 291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00]
+ crc32b 291(%r28,%r29,4), %r19
+
+# CHECK: crc32q 291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00]
+ crc32q 291(%r28,%r29,4), %r19
+
+# CHECK: crc32w 123(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0xf1,0x54,0xac,0x7b]
+ crc32w 123(%r28,%r29,4), %r18d
+
+# CHECK: crc32l 123(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0xf1,0x54,0xac,0x7b]
+ crc32l 123(%r28,%r29,4), %r18d
+
+# CHECK: crc32b 123(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf0,0x5c,0xac,0x7b]
+ crc32b 123(%r28,%r29,4), %r19
+
+# CHECK: crc32q 123(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf1,0x5c,0xac,0x7b]
+ crc32q 123(%r28,%r29,4), %r19
diff --git a/llvm/test/MC/X86/apx/crc32-intel.s b/llvm/test/MC/X86/apx/crc32-intel.s
new file mode 100644
index 000000000000..a7bc915a0a09
--- /dev/null
+++ b/llvm/test/MC/X86/apx/crc32-intel.s
@@ -0,0 +1,89 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: {evex} crc32 ebx, al
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf0,0xd8]
+ {evex} crc32 ebx, al
+
+# CHECK: {evex} crc32 rbx, al
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf0,0xd8]
+ {evex} crc32 rbx, al
+
+# CHECK: {evex} crc32 ebx, ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xd8]
+ {evex} crc32 ebx, ax
+
+# CHECK: {evex} crc32 ebx, eax
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf1,0xd8]
+ {evex} crc32 ebx, eax
+
+# CHECK: {evex} crc32 rbx, rax
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf1,0xd8]
+ {evex} crc32 rbx, rax
+
+# CHECK: {evex} crc32 ecx, word ptr [rax + 4*rbx + 291]
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32 ecx, word ptr [rax + 4*rbx + 291]
+
+# CHECK: {evex} crc32 ecx, dword ptr [rax + 4*rbx + 291]
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32 ecx, dword ptr [rax + 4*rbx + 291]
+
+# CHECK: {evex} crc32 rcx, byte ptr [rax + 4*rbx + 291]
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf0,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32 rcx, byte ptr [rax + 4*rbx + 291]
+
+# CHECK: {evex} crc32 rcx, qword ptr [rax + 4*rbx + 291]
+# CHECK: encoding: [0x62,0xf4,0xfc,0x08,0xf1,0x8c,0x98,0x23,0x01,0x00,0x00]
+ {evex} crc32 rcx, qword ptr [rax + 4*rbx + 291]
+
+# CHECK: crc32 r22d, r16b
+# CHECK: encoding: [0x62,0xec,0x7c,0x08,0xf0,0xf0]
+ crc32 r22d, r16b
+
+# CHECK: crc32 r23, r16b
+# CHECK: encoding: [0x62,0xec,0xfc,0x08,0xf0,0xf8]
+ crc32 r23, r16b
+
+# CHECK: crc32 r22d, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x08,0xf1,0xf1]
+ crc32 r22d, r17w
+
+# CHECK: crc32 r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x08,0xf1,0xf2]
+ crc32 r22d, r18d
+
+# CHECK: crc32 r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x08,0xf1,0xfb]
+ crc32 r23, r19
+
+# CHECK: crc32 r18d, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0xf1,0x94,0xac,0x23,0x01,0x00,0x00]
+ crc32 r18d, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: crc32 r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0xf1,0x94,0xac,0x23,0x01,0x00,0x00]
+ crc32 r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: crc32 r19, byte ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf0,0x9c,0xac,0x23,0x01,0x00,0x00]
+ crc32 r19, byte ptr [r28 + 4*r29 + 291]
+
+# CHECK: crc32 r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf1,0x9c,0xac,0x23,0x01,0x00,0x00]
+ crc32 r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: crc32 r18d, word ptr [r28 + 4*r29 + 123]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0xf1,0x54,0xac,0x7b]
+ crc32 r18d, word ptr [r28 + 4*r29 + 123]
+
+# CHECK: crc32 r18d, dword ptr [r28 + 4*r29 + 123]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0xf1,0x54,0xac,0x7b]
+ crc32 r18d, dword ptr [r28 + 4*r29 + 123]
+
+# CHECK: crc32 r19, byte ptr [r28 + 4*r29 + 123]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf0,0x5c,0xac,0x7b]
+ crc32 r19, byte ptr [r28 + 4*r29 + 123]
+
+# CHECK: crc32 r19, qword ptr [r28 + 4*r29 + 123]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0xf1,0x5c,0xac,0x7b]
+ crc32 r19, qword ptr [r28 + 4*r29 + 123]
diff --git a/llvm/test/MC/X86/apx/evex-format-att.s b/llvm/test/MC/X86/apx/evex-format-att.s
index 0b2e860d6ba0..577e19697694 100644
--- a/llvm/test/MC/X86/apx/evex-format-att.s
+++ b/llvm/test/MC/X86/apx/evex-format-att.s
@@ -7,59 +7,102 @@
# CHECK: encoding: [0x62,0xfb,0x79,0x48,0x19,0x04,0x08,0x01]
vextractf32x4 $1, %zmm0, (%r16,%r17)
+# CHECK: addq %r16, 123(%r17), %r18
+# CHECK: encoding: [0x62,0xec,0xec,0x10,0x01,0x41,0x7b]
+ addq %r16, 123(%r17), %r18
+
## MRMSrcMem
# CHECK: vbroadcasti32x4 (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xfa,0x79,0x48,0x5a,0x04,0x08]
vbroadcasti32x4 (%r16,%r17), %zmm0
+# CHECK: subq 123(%r16), %r17, %r18
+# CHECK: encoding: [0x62,0xec,0xec,0x10,0x2b,0x48,0x7b]
+ subq 123(%r16), %r17, %r18
+
## MRM0m
# CHECK: vprorq $0, (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x72,0x04,0x08,0x00]
vprorq $0, (%r16,%r17), %zmm0
+# CHECK: addq $127, 123(%r16), %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x40,0x7b,0x7f]
+ addq $127, 123(%r16), %r17
+
## MRM1m
# CHECK: vprolq $0, (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x72,0x0c,0x08,0x00]
vprolq $0, (%r16,%r17), %zmm0
+
+# CHECK: orq $127, 123(%r16), %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x48,0x7b,0x7f]
+ orq $127, 123(%r16), %r17
+
## MRM2m
# CHECK: vpsrlq $0, (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x73,0x14,0x08,0x00]
vpsrlq $0, (%r16,%r17), %zmm0
+# CHECK: adcq $127, 123(%r16), %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x50,0x7b,0x7f]
+ adcq $127, 123(%r16), %r17
+
## MRM3m
# CHECK: vpsrldq $0, (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x1c,0x08,0x00]
vpsrldq $0, (%r16,%r17), %zmm0
+# CHECK: sbbq $127, 123(%r16), %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x58,0x7b,0x7f]
+ sbbq $127, 123(%r16), %r17
+
## MRM4m
# CHECK: vpsraq $0, (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x72,0x24,0x08,0x00]
vpsraq $0, (%r16,%r17), %zmm0
+# CHECK: andq $127, 123(%r16), %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x60,0x7b,0x7f]
+ andq $127, 123(%r16), %r17
+
## MRM5m
# CHECK: vscatterpf0dps (%r16,%zmm0) {%k1}
# CHECK: encoding: [0x62,0xfa,0x7d,0x49,0xc6,0x2c,0x00]
vscatterpf0dps (%r16,%zmm0) {%k1}
+# CHECK: subq $127, 123(%r16), %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x68,0x7b,0x7f]
+ subq $127, 123(%r16), %r17
+
## MRM6m
# CHECK: vpsllq $0, (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x73,0x34,0x08,0x00]
vpsllq $0, (%r16,%r17), %zmm0
+# CHECK: xorq $127, 123(%r16), %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x70,0x7b,0x7f]
+ xorq $127, 123(%r16), %r17
+
## MRM7m
# CHECK: vpslldq $0, (%r16,%r17), %zmm0
# CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00]
vpslldq $0, (%r16,%r17), %zmm0
+## MRMDestMem4VOp3CC
+
+# CHECK: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpbexadd %r18d, %r22d, 291(%r28,%r29,4)
+
## MRMSrcMem4VOp3
# CHECK: bzhiq %r19, 291(%r28,%r29,4), %r23
@@ -72,8 +115,70 @@
# CHECK: encoding: [0x62,0xeb,0x7d,0x08,0x17,0xc0,0x01]
vextractps $1, %xmm16, %r16d
+# CHECK: {nf} addq %r16, %r17
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x01,0xc1]
+ {nf} addq %r16, %r17
+
+## MRMSrcReg
+
+# CHECK: mulxq %r16, %r17, %r18
+# CHECK: encoding: [0x62,0xea,0xf7,0x00,0xf6,0xd0]
+ mulxq %r16, %r17, %r18
+
## MRMSrcReg4VOp3
# CHECK: bzhiq %r19, %r23, %r27
# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf5,0xdf]
bzhiq %r19, %r23, %r27
+
+## MRM0r
+
+# CHECK: addq $127, %r16, %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xc0,0x7f]
+ addq $127, %r16, %r17
+
+## MRM1r
+
+# CHECK: orq $127, %r16, %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xc8,0x7f]
+ orq $127, %r16, %r17
+
+## MRM2r
+
+# CHECK: adcq $127, %r16, %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xd0,0x7f]
+ adcq $127, %r16, %r17
+
+## MRM3r
+
+# CHECK: sbbq $127, %r16, %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xd8,0x7f]
+ sbbq $127, %r16, %r17
+
+## MRM4r
+
+# CHECK: andq $127, %r16, %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xe0,0x7f]
+ andq $127, %r16, %r17
+
+## MRM5r
+
+# CHECK: subq $127, %r16, %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xe8,0x7f]
+ subq $127, %r16, %r17
+
+## MRM6r
+
+# CHECK: xorq $127, %r16, %r17
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xf0,0x7f]
+ xorq $127, %r16, %r17
+
+## NoCD8
+
+# CHECK: {nf} negq 123(%r16)
+# CHECK: encoding: [0x62,0xfc,0xfc,0x0c,0xf7,0x58,0x7b]
+ {nf} negq 123(%r16)
+
+# CHECK: {evex} notq 123(%r16)
+# CHECK: encoding: [0x62,0xfc,0xfc,0x08,0xf7,0x50,0x7b]
+ {evex} notq 123(%r16)
diff --git a/llvm/test/MC/X86/apx/evex-format-intel.s b/llvm/test/MC/X86/apx/evex-format-intel.s
index ececb7137b11..b35664343bf6 100644
--- a/llvm/test/MC/X86/apx/evex-format-intel.s
+++ b/llvm/test/MC/X86/apx/evex-format-intel.s
@@ -7,59 +7,102 @@
# CHECK: encoding: [0x62,0xfb,0x79,0x48,0x19,0x04,0x08,0x01]
vextractf32x4 xmmword ptr [r16 + r17], zmm0, 1
+# CHECK: add r18, qword ptr [r17 + 123], r16
+# CHECK: encoding: [0x62,0xec,0xec,0x10,0x01,0x41,0x7b]
+ add r18, qword ptr [r17 + 123], r16
+
## MRMSrcMem
# CHECK: vbroadcasti32x4 zmm0, xmmword ptr [r16 + r17]
# CHECK: encoding: [0x62,0xfa,0x79,0x48,0x5a,0x04,0x08]
vbroadcasti32x4 zmm0, xmmword ptr [r16 + r17]
+# CHECK: sub r18, r17, qword ptr [r16 + 123]
+# CHECK: encoding: [0x62,0xec,0xec,0x10,0x2b,0x48,0x7b]
+ sub r18, r17, qword ptr [r16 + 123]
+
## MRM0m
# CHECK: vprorq zmm0, zmmword ptr [r16 + r17], 0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x72,0x04,0x08,0x00]
vprorq zmm0, zmmword ptr [r16 + r17], 0
+# CHECK: add r17, qword ptr [r16 + 123], 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x40,0x7b,0x7f]
+ add r17, qword ptr [r16 + 123], 127
+
## MRM1m
# CHECK: vprolq zmm0, zmmword ptr [r16 + r17], 0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x72,0x0c,0x08,0x00]
vprolq zmm0, zmmword ptr [r16 + r17], 0
+# CHECK: or r17, qword ptr [r16 + 123], 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x48,0x7b,0x7f]
+ or r17, qword ptr [r16 + 123], 127
+
## MRM2m
# CHECK: vpsrlq zmm0, zmmword ptr [r16 + r17], 0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x73,0x14,0x08,0x00]
vpsrlq zmm0, zmmword ptr [r16 + r17], 0
+# CHECK: adc r17, qword ptr [r16 + 123], 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x50,0x7b,0x7f]
+ adc r17, qword ptr [r16 + 123], 127
+
## MRM3m
# CHECK: vpsrldq zmm0, zmmword ptr [r16 + r17], 0
# CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x1c,0x08,0x00]
vpsrldq zmm0, zmmword ptr [r16 + r17], 0
+
+# CHECK: sbb r17, qword ptr [r16 + 123], 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x58,0x7b,0x7f]
+ sbb r17, qword ptr [r16 + 123], 127
+
## MRM4m
# CHECK: vpsraq zmm0, zmmword ptr [r16 + r17], 0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x72,0x24,0x08,0x00]
vpsraq zmm0, zmmword ptr [r16 + r17], 0
+# CHECK: and r17, qword ptr [r16 + 123], 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x60,0x7b,0x7f]
+ and r17, qword ptr [r16 + 123], 127
+
## MRM5m
## AsmParser is buggy for this KNC instruction
# C;HECK: vscatterpf0dps {k1}, zmmword ptr [r16 + zmm0]
# C;HECK: encoding: [0x62,0xfa,0x7d,0x49,0xc6,0x2c,0x00]
# vscatterpf0dps {k1}, zmmword ptr [r16 + zmm0]
+# CHECK: sub r17, qword ptr [r16 + 123], 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x68,0x7b,0x7f]
+ sub r17, qword ptr [r16 + 123], 127
+
## MRM6m
# CHECK: vpsllq zmm0, zmmword ptr [r16 + r17], 0
# CHECK: encoding: [0x62,0xf9,0xf9,0x48,0x73,0x34,0x08,0x00]
vpsllq zmm0, zmmword ptr [r16 + r17], 0
+# CHECK: xor r17, qword ptr [r16 + 123], 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0x70,0x7b,0x7f]
+ xor r17, qword ptr [r16 + 123], 127
+
## MRM7m
# CHECK: vpslldq zmm0, zmmword ptr [r16 + r17], 0
# CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00]
vpslldq zmm0, zmmword ptr [r16 + r17], 0
+## MRMDestMem4VOp3CC
+
+# CHECK: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00]
+ cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d
+
## MRMSrcMem4VOp3
# CHECK: bzhi r23, qword ptr [r28 + 4*r29 + 291], r19
@@ -72,8 +115,70 @@
# CHECK: encoding: [0x62,0xeb,0x7d,0x08,0x17,0xc0,0x01]
vextractps r16d, xmm16, 1
+# CHECK: {nf} add r17, r16
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x01,0xc1]
+ {nf} add r17, r16
+
+## MRMSrcReg
+
+# CHECK: mulx r18, r17, r16
+# CHECK: encoding: [0x62,0xea,0xf7,0x00,0xf6,0xd0]
+ mulx r18, r17, r16
+
## MRMSrcReg4VOp3
# CHECK: bzhi r27, r23, r19
# CHECK: encoding: [0x62,0x6a,0xe4,0x00,0xf5,0xdf]
bzhi r27, r23, r19
+
+## MRM0r
+
+# CHECK: add r17, r16, 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xc0,0x7f]
+ add r17, r16, 127
+
+## MRM1r
+
+# CHECK: or r17, r16, 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xc8,0x7f]
+ or r17, r16, 127
+
+## MRM2r
+
+# CHECK: adc r17, r16, 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xd0,0x7f]
+ adc r17, r16, 127
+
+## MRM3r
+
+# CHECK: sbb r17, r16, 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xd8,0x7f]
+ sbb r17, r16, 127
+
+## MRM4r
+
+# CHECK: and r17, r16, 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xe0,0x7f]
+ and r17, r16, 127
+
+## MRM5r
+
+# CHECK: sub r17, r16, 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xe8,0x7f]
+ sub r17, r16, 127
+
+## MRM6r
+
+# CHECK: xor r17, r16, 127
+# CHECK: encoding: [0x62,0xfc,0xf4,0x10,0x83,0xf0,0x7f]
+ xor r17, r16, 127
+
+## NoCD8
+
+# CHECK: {nf} neg qword ptr [r16 + 123]
+# CHECK: encoding: [0x62,0xfc,0xfc,0x0c,0xf7,0x58,0x7b]
+ {nf} neg qword ptr [r16 + 123]
+
+# CHECK: {evex} not qword ptr [r16 + 123]
+# CHECK: encoding: [0x62,0xfc,0xfc,0x08,0xf7,0x50,0x7b]
+ {evex} not qword ptr [r16 + 123]
diff --git a/llvm/test/MC/X86/apx/neg-att.s b/llvm/test/MC/X86/apx/neg-att.s
new file mode 100644
index 000000000000..c40241b7960d
--- /dev/null
+++ b/llvm/test/MC/X86/apx/neg-att.s
@@ -0,0 +1,101 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-32: error:
+# ERROR-NOT: error:
+# CHECK: {evex} negb %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf6,0xdb]
+ {evex} negb %bl
+# CHECK: {nf} negb %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xf6,0xdb]
+ {nf} negb %bl
+# CHECK: negb %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xf6,0xdb]
+ negb %bl, %bl
+# CHECK: {nf} negb %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xf6,0xdb]
+ {nf} negb %bl, %bl
+# CHECK: {evex} negw %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf7,0xda]
+ {evex} negw %dx
+# CHECK: {nf} negw %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xf7,0xda]
+ {nf} negw %dx
+# CHECK: negw %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xf7,0xda]
+ negw %dx, %dx
+# CHECK: {nf} negw %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xf7,0xda]
+ {nf} negw %dx, %dx
+# CHECK: {evex} negl %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf7,0xd9]
+ {evex} negl %ecx
+# CHECK: {nf} negl %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xf7,0xd9]
+ {nf} negl %ecx
+# CHECK: negl %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xf7,0xd9]
+ negl %ecx, %ecx
+# CHECK: {nf} negl %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xf7,0xd9]
+ {nf} negl %ecx, %ecx
+# CHECK: {evex} negq %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0xd9]
+ {evex} negq %r9
+# CHECK: {nf} negq %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xf7,0xd9]
+ {nf} negq %r9
+# CHECK: negq %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0xd9]
+ negq %r9, %r9
+# CHECK: {nf} negq %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xf7,0xd9]
+ {nf} negq %r9, %r9
+# CHECK: {evex} negb 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} negb 291(%r8,%rax,4)
+# CHECK: {nf} negb 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negb 291(%r8,%rax,4)
+# CHECK: negb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ negb 291(%r8,%rax,4), %bl
+# CHECK: {nf} negb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negb 291(%r8,%rax,4), %bl
+# CHECK: {evex} negw 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} negw 291(%r8,%rax,4)
+# CHECK: {nf} negw 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negw 291(%r8,%rax,4)
+# CHECK: negw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ negw 291(%r8,%rax,4), %dx
+# CHECK: {nf} negw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negw 291(%r8,%rax,4), %dx
+# CHECK: {evex} negl 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} negl 291(%r8,%rax,4)
+# CHECK: {nf} negl 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negl 291(%r8,%rax,4)
+# CHECK: negl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ negl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} negl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negl 291(%r8,%rax,4), %ecx
+# CHECK: {evex} negq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} negq 291(%r8,%rax,4)
+# CHECK: {nf} negq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negq 291(%r8,%rax,4)
+# CHECK: negq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ negq 291(%r8,%rax,4), %r9
+# CHECK: {nf} negq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} negq 291(%r8,%rax,4), %r9
diff --git a/llvm/test/MC/X86/apx/neg-intel.s b/llvm/test/MC/X86/apx/neg-intel.s
new file mode 100644
index 000000000000..528c1ba74c15
--- /dev/null
+++ b/llvm/test/MC/X86/apx/neg-intel.s
@@ -0,0 +1,98 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} neg bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf6,0xdb]
+ {evex} neg bl
+# CHECK: {nf} neg bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xf6,0xdb]
+ {nf} neg bl
+# CHECK: neg bl, bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xf6,0xdb]
+ neg bl, bl
+# CHECK: {nf} neg bl, bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xf6,0xdb]
+ {nf} neg bl, bl
+# CHECK: {evex} neg dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf7,0xda]
+ {evex} neg dx
+# CHECK: {nf} neg dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xf7,0xda]
+ {nf} neg dx
+# CHECK: neg dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xf7,0xda]
+ neg dx, dx
+# CHECK: {nf} neg dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xf7,0xda]
+ {nf} neg dx, dx
+# CHECK: {evex} neg ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf7,0xd9]
+ {evex} neg ecx
+# CHECK: {nf} neg ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xf7,0xd9]
+ {nf} neg ecx
+# CHECK: neg ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xf7,0xd9]
+ neg ecx, ecx
+# CHECK: {nf} neg ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xf7,0xd9]
+ {nf} neg ecx, ecx
+# CHECK: {evex} neg r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0xd9]
+ {evex} neg r9
+# CHECK: {nf} neg r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xf7,0xd9]
+ {nf} neg r9
+# CHECK: neg r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0xd9]
+ neg r9, r9
+# CHECK: {nf} neg r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xf7,0xd9]
+ {nf} neg r9, r9
+# CHECK: {evex} neg byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} neg byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg byte ptr [r8 + 4*rax + 291]
+# CHECK: neg bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ neg bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xf6,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} neg word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} neg word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg word ptr [r8 + 4*rax + 291]
+# CHECK: neg dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ neg dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} neg dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} neg dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg dword ptr [r8 + 4*rax + 291]
+# CHECK: neg ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ neg ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} neg qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} neg qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg qword ptr [r8 + 4*rax + 291]
+# CHECK: neg r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ neg r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} neg r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xf7,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} neg r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/not-att.s b/llvm/test/MC/X86/apx/not-att.s
new file mode 100644
index 000000000000..d5b7a406544d
--- /dev/null
+++ b/llvm/test/MC/X86/apx/not-att.s
@@ -0,0 +1,53 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-16: error:
+# ERROR-NOT: error:
+# CHECK: {evex} notb %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf6,0xd3]
+ {evex} notb %bl
+# CHECK: notb %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xf6,0xd3]
+ notb %bl, %bl
+# CHECK: {evex} notw %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf7,0xd2]
+ {evex} notw %dx
+# CHECK: notw %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xf7,0xd2]
+ notw %dx, %dx
+# CHECK: {evex} notl %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf7,0xd1]
+ {evex} notl %ecx
+# CHECK: notl %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xf7,0xd1]
+ notl %ecx, %ecx
+# CHECK: {evex} notq %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0xd1]
+ {evex} notq %r9
+# CHECK: notq %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0xd1]
+ notq %r9, %r9
+# CHECK: {evex} notb 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf6,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} notb 291(%r8,%rax,4)
+# CHECK: notb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xf6,0x94,0x80,0x23,0x01,0x00,0x00]
+ notb 291(%r8,%rax,4), %bl
+# CHECK: {evex} notw 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} notw 291(%r8,%rax,4)
+# CHECK: notw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ notw 291(%r8,%rax,4), %dx
+# CHECK: {evex} notl 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} notl 291(%r8,%rax,4)
+# CHECK: notl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ notl 291(%r8,%rax,4), %ecx
+# CHECK: {evex} notq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} notq 291(%r8,%rax,4)
+# CHECK: notq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ notq 291(%r8,%rax,4), %r9
diff --git a/llvm/test/MC/X86/apx/not-intel.s b/llvm/test/MC/X86/apx/not-intel.s
new file mode 100644
index 000000000000..5e106559a0cb
--- /dev/null
+++ b/llvm/test/MC/X86/apx/not-intel.s
@@ -0,0 +1,50 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} not bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf6,0xd3]
+ {evex} not bl
+# CHECK: not bl, bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xf6,0xd3]
+ not bl, bl
+# CHECK: {evex} not dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xf7,0xd2]
+ {evex} not dx
+# CHECK: not dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xf7,0xd2]
+ not dx, dx
+# CHECK: {evex} not ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xf7,0xd1]
+ {evex} not ecx
+# CHECK: not ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xf7,0xd1]
+ not ecx, ecx
+# CHECK: {evex} not r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0xd1]
+ {evex} not r9
+# CHECK: not r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0xd1]
+ not r9, r9
+# CHECK: {evex} not byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf6,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} not byte ptr [r8 + 4*rax + 291]
+# CHECK: not bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xf6,0x94,0x80,0x23,0x01,0x00,0x00]
+ not bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} not word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} not word ptr [r8 + 4*rax + 291]
+# CHECK: not dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ not dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} not dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} not dword ptr [r8 + 4*rax + 291]
+# CHECK: not ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ not ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} not qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} not qword ptr [r8 + 4*rax + 291]
+# CHECK: not r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xf7,0x94,0x80,0x23,0x01,0x00,0x00]
+ not r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/or-att.s b/llvm/test/MC/X86/apx/or-att.s
new file mode 100644
index 000000000000..205d3c397c31
--- /dev/null
+++ b/llvm/test/MC/X86/apx/or-att.s
@@ -0,0 +1,317 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-104: error:
+# ERROR-NOT: error:
+# CHECK: {evex} orb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xcb,0x7b]
+ {evex} orb $123, %bl
+# CHECK: {nf} orb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xcb,0x7b]
+ {nf} orb $123, %bl
+# CHECK: orb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xcb,0x7b]
+ orb $123, %bl, %cl
+# CHECK: {nf} orb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xcb,0x7b]
+ {nf} orb $123, %bl, %cl
+# CHECK: {evex} orw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xca,0x7b]
+ {evex} orw $123, %dx
+# CHECK: {nf} orw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xca,0x7b]
+ {nf} orw $123, %dx
+# CHECK: orw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xca,0x7b]
+ orw $123, %dx, %ax
+# CHECK: {nf} orw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xca,0x7b]
+ {nf} orw $123, %dx, %ax
+# CHECK: {evex} orl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xc9,0x7b]
+ {evex} orl $123, %ecx
+# CHECK: {nf} orl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xc9,0x7b]
+ {nf} orl $123, %ecx
+# CHECK: orl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xc9,0x7b]
+ orl $123, %ecx, %edx
+# CHECK: {nf} orl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xc9,0x7b]
+ {nf} orl $123, %ecx, %edx
+# CHECK: {evex} orq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xc9,0x7b]
+ {evex} orq $123, %r9
+# CHECK: {nf} orq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xc9,0x7b]
+ {nf} orq $123, %r9
+# CHECK: orq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xc9,0x7b]
+ orq $123, %r9, %r15
+# CHECK: {nf} orq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xc9,0x7b]
+ {nf} orq $123, %r9, %r15
+# CHECK: {evex} orb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} orb $123, 291(%r8,%rax,4)
+# CHECK: {nf} orb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orb $123, 291(%r8,%rax,4)
+# CHECK: orb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ orb $123, 291(%r8,%rax,4), %bl
+# CHECK: {nf} orb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} orw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} orw $123, 291(%r8,%rax,4)
+# CHECK: {nf} orw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orw $123, 291(%r8,%rax,4)
+# CHECK: orw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ orw $123, 291(%r8,%rax,4), %dx
+# CHECK: {nf} orw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} orl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} orl $123, 291(%r8,%rax,4)
+# CHECK: {nf} orl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orl $123, 291(%r8,%rax,4)
+# CHECK: orl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ orl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} orl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} orq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} orq $123, 291(%r8,%rax,4)
+# CHECK: {nf} orq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orq $123, 291(%r8,%rax,4)
+# CHECK: orq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ orq $123, 291(%r8,%rax,4), %r9
+# CHECK: {nf} orq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} orq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} orw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xca,0xd2,0x04]
+ {evex} orw $1234, %dx
+# CHECK: {nf} orw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xca,0xd2,0x04]
+ {nf} orw $1234, %dx
+# CHECK: orw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xca,0xd2,0x04]
+ orw $1234, %dx, %ax
+# CHECK: {nf} orw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xca,0xd2,0x04]
+ {nf} orw $1234, %dx, %ax
+# CHECK: {evex} orw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} orw $1234, 291(%r8,%rax,4)
+# CHECK: {nf} orw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} orw $1234, 291(%r8,%rax,4)
+# CHECK: orw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ orw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {nf} orw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} orw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {evex} orl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {evex} orl $123456, %ecx
+# CHECK: {nf} orl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} orl $123456, %ecx
+# CHECK: orl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ orl $123456, %ecx, %edx
+# CHECK: {nf} orl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} orl $123456, %ecx, %edx
+# CHECK: {evex} orq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {evex} orq $123456, %r9
+# CHECK: {nf} orq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} orq $123456, %r9
+# CHECK: orq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ orq $123456, %r9, %r15
+# CHECK: {nf} orq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} orq $123456, %r9, %r15
+# CHECK: {evex} orl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} orl $123456, 291(%r8,%rax,4)
+# CHECK: {nf} orl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} orl $123456, 291(%r8,%rax,4)
+# CHECK: orl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ orl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} orl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} orl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} orq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} orq $123456, 291(%r8,%rax,4)
+# CHECK: {nf} orq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} orq $123456, 291(%r8,%rax,4)
+# CHECK: orq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ orq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {nf} orq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} orq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {evex} orb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x08,0xd9]
+ {evex} orb %bl, %cl
+# CHECK: {nf} orb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x08,0xd9]
+ {nf} orb %bl, %cl
+# CHECK: orb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x08,0xd9]
+ orb %bl, %cl, %r8b
+# CHECK: {nf} orb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x08,0xd9]
+ {nf} orb %bl, %cl, %r8b
+# CHECK: {evex} orb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} orb %bl, 291(%r8,%rax,4)
+# CHECK: {nf} orb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orb %bl, 291(%r8,%rax,4)
+# CHECK: orb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ orb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {nf} orb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {evex} orw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x09,0xd0]
+ {evex} orw %dx, %ax
+# CHECK: {nf} orw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x09,0xd0]
+ {nf} orw %dx, %ax
+# CHECK: orw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x09,0xd0]
+ orw %dx, %ax, %r9w
+# CHECK: {nf} orw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x09,0xd0]
+ {nf} orw %dx, %ax, %r9w
+# CHECK: {evex} orw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} orw %dx, 291(%r8,%rax,4)
+# CHECK: {nf} orw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} orw %dx, 291(%r8,%rax,4)
+# CHECK: orw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ orw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {nf} orw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} orw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {evex} orl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x09,0xca]
+ {evex} orl %ecx, %edx
+# CHECK: {nf} orl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x09,0xca]
+ {nf} orl %ecx, %edx
+# CHECK: orl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x09,0xca]
+ orl %ecx, %edx, %r10d
+# CHECK: {nf} orl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x09,0xca]
+ {nf} orl %ecx, %edx, %r10d
+# CHECK: {evex} orl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} orl %ecx, 291(%r8,%rax,4)
+# CHECK: {nf} orl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orl %ecx, 291(%r8,%rax,4)
+# CHECK: orl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ orl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {nf} orl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {evex} orq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x09,0xcf]
+ {evex} orq %r9, %r15
+# CHECK: {nf} orq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x09,0xcf]
+ {nf} orq %r9, %r15
+# CHECK: orq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x09,0xcf]
+ orq %r9, %r15, %r11
+# CHECK: {nf} orq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x09,0xcf]
+ {nf} orq %r9, %r15, %r11
+# CHECK: {evex} orq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} orq %r9, 291(%r8,%rax,4)
+# CHECK: {nf} orq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orq %r9, 291(%r8,%rax,4)
+# CHECK: orq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ orq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {nf} orq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {evex} orb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} orb 291(%r8,%rax,4), %bl
+# CHECK: {nf} orb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orb 291(%r8,%rax,4), %bl
+# CHECK: orb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ orb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {nf} orb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {evex} orw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} orw 291(%r8,%rax,4), %dx
+# CHECK: {nf} orw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} orw 291(%r8,%rax,4), %dx
+# CHECK: orw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ orw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {nf} orw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} orw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {evex} orl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} orl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} orl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orl 291(%r8,%rax,4), %ecx
+# CHECK: orl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ orl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {nf} orl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {evex} orq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} orq 291(%r8,%rax,4), %r9
+# CHECK: {nf} orq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orq 291(%r8,%rax,4), %r9
+# CHECK: orq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ orq 291(%r8,%rax,4), %r9, %r15
+# CHECK: {nf} orq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} orq 291(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/or-intel.s b/llvm/test/MC/X86/apx/or-intel.s
new file mode 100644
index 000000000000..d631c732f1df
--- /dev/null
+++ b/llvm/test/MC/X86/apx/or-intel.s
@@ -0,0 +1,314 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} or bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xcb,0x7b]
+ {evex} or bl, 123
+# CHECK: {nf} or bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xcb,0x7b]
+ {nf} or bl, 123
+# CHECK: or cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xcb,0x7b]
+ or cl, bl, 123
+# CHECK: {nf} or cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xcb,0x7b]
+ {nf} or cl, bl, 123
+# CHECK: {evex} or dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xca,0x7b]
+ {evex} or dx, 123
+# CHECK: {nf} or dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xca,0x7b]
+ {nf} or dx, 123
+# CHECK: or ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xca,0x7b]
+ or ax, dx, 123
+# CHECK: {nf} or ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xca,0x7b]
+ {nf} or ax, dx, 123
+# CHECK: {evex} or ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xc9,0x7b]
+ {evex} or ecx, 123
+# CHECK: {nf} or ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xc9,0x7b]
+ {nf} or ecx, 123
+# CHECK: or edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xc9,0x7b]
+ or edx, ecx, 123
+# CHECK: {nf} or edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xc9,0x7b]
+ {nf} or edx, ecx, 123
+# CHECK: {evex} or r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xc9,0x7b]
+ {evex} or r9, 123
+# CHECK: {nf} or r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xc9,0x7b]
+ {nf} or r9, 123
+# CHECK: or r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xc9,0x7b]
+ or r15, r9, 123
+# CHECK: {nf} or r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xc9,0x7b]
+ {nf} or r15, r9, 123
+# CHECK: {evex} or byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} or byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or byte ptr [r8 + 4*rax + 291], 123
+# CHECK: or bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ or bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} or word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} or word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or word ptr [r8 + 4*rax + 291], 123
+# CHECK: or dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ or dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} or dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} or dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or dword ptr [r8 + 4*rax + 291], 123
+# CHECK: or ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ or ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} or qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} or qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or qword ptr [r8 + 4*rax + 291], 123
+# CHECK: or r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ or r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} or r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} or r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} or dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xca,0xd2,0x04]
+ {evex} or dx, 1234
+# CHECK: {nf} or dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xca,0xd2,0x04]
+ {nf} or dx, 1234
+# CHECK: or ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xca,0xd2,0x04]
+ or ax, dx, 1234
+# CHECK: {nf} or ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xca,0xd2,0x04]
+ {nf} or ax, dx, 1234
+# CHECK: {evex} or word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} or word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} or word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} or word ptr [r8 + 4*rax + 291], 1234
+# CHECK: or dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ or dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} or dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} or dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {evex} or ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {evex} or ecx, 123456
+# CHECK: {nf} or ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} or ecx, 123456
+# CHECK: or edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ or edx, ecx, 123456
+# CHECK: {nf} or edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} or edx, ecx, 123456
+# CHECK: {evex} or r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {evex} or r9, 123456
+# CHECK: {nf} or r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} or r9, 123456
+# CHECK: or r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ or r15, r9, 123456
+# CHECK: {nf} or r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xc9,0x40,0xe2,0x01,0x00]
+ {nf} or r15, r9, 123456
+# CHECK: {evex} or dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} or dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} or dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} or dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: or ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ or ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} or ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} or ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} or qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} or qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} or qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} or qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: or r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ or r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} or r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} or r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} or cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x08,0xd9]
+ {evex} or cl, bl
+# CHECK: {nf} or cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x08,0xd9]
+ {nf} or cl, bl
+# CHECK: or r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x08,0xd9]
+ or r8b, cl, bl
+# CHECK: {nf} or r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x08,0xd9]
+ {nf} or r8b, cl, bl
+# CHECK: {evex} or byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} or byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} or byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or byte ptr [r8 + 4*rax + 291], bl
+# CHECK: or cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ or cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} or cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x08,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {evex} or ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x09,0xd0]
+ {evex} or ax, dx
+# CHECK: {nf} or ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x09,0xd0]
+ {nf} or ax, dx
+# CHECK: or r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x09,0xd0]
+ or r9w, ax, dx
+# CHECK: {nf} or r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x09,0xd0]
+ {nf} or r9w, ax, dx
+# CHECK: {evex} or word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} or word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} or word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} or word ptr [r8 + 4*rax + 291], dx
+# CHECK: or ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ or ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} or ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x09,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} or ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {evex} or edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x09,0xca]
+ {evex} or edx, ecx
+# CHECK: {nf} or edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x09,0xca]
+ {nf} or edx, ecx
+# CHECK: or r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x09,0xca]
+ or r10d, edx, ecx
+# CHECK: {nf} or r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x09,0xca]
+ {nf} or r10d, edx, ecx
+# CHECK: {evex} or dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} or dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} or dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: or edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ or edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} or edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {evex} or r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x09,0xcf]
+ {evex} or r15, r9
+# CHECK: {nf} or r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x09,0xcf]
+ {nf} or r15, r9
+# CHECK: or r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x09,0xcf]
+ or r11, r15, r9
+# CHECK: {nf} or r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x09,0xcf]
+ {nf} or r11, r15, r9
+# CHECK: {evex} or qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} or qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} or qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or qword ptr [r8 + 4*rax + 291], r9
+# CHECK: or r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ or r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} or r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x09,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {evex} or bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} or bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: or cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ or cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x0a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} or dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} or dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} or dx, word ptr [r8 + 4*rax + 291]
+# CHECK: or ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ or ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x0b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} or ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} or ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} or ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: or edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ or edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} or r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} or r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: or r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ or r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} or r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x0b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} or r15, r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/sbb-att.s b/llvm/test/MC/X86/apx/sbb-att.s
new file mode 100644
index 000000000000..80b909edb4e9
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sbb-att.s
@@ -0,0 +1,161 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-52: error:
+# ERROR-NOT: error:
+# CHECK: {evex} sbbb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xdb,0x7b]
+ {evex} sbbb $123, %bl
+# CHECK: sbbb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xdb,0x7b]
+ sbbb $123, %bl, %cl
+# CHECK: {evex} sbbw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xda,0x7b]
+ {evex} sbbw $123, %dx
+# CHECK: sbbw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xda,0x7b]
+ sbbw $123, %dx, %ax
+# CHECK: {evex} sbbl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xd9,0x7b]
+ {evex} sbbl $123, %ecx
+# CHECK: sbbl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xd9,0x7b]
+ sbbl $123, %ecx, %edx
+# CHECK: {evex} sbbq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xd9,0x7b]
+ {evex} sbbq $123, %r9
+# CHECK: sbbq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xd9,0x7b]
+ sbbq $123, %r9, %r15
+# CHECK: {evex} sbbb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbbb $123, 291(%r8,%rax,4)
+# CHECK: sbbb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbbb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} sbbw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbbw $123, 291(%r8,%rax,4)
+# CHECK: sbbw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbbw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} sbbl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbbl $123, 291(%r8,%rax,4)
+# CHECK: sbbl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbbl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} sbbq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbbq $123, 291(%r8,%rax,4)
+# CHECK: sbbq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbbq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} sbbw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xda,0xd2,0x04]
+ {evex} sbbw $1234, %dx
+# CHECK: sbbw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xda,0xd2,0x04]
+ sbbw $1234, %dx, %ax
+# CHECK: {evex} sbbw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} sbbw $1234, 291(%r8,%rax,4)
+# CHECK: sbbw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ sbbw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {evex} sbbl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ {evex} sbbl $123456, %ecx
+# CHECK: sbbl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ sbbl $123456, %ecx, %edx
+# CHECK: {evex} sbbq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ {evex} sbbq $123456, %r9
+# CHECK: sbbq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ sbbq $123456, %r9, %r15
+# CHECK: {evex} sbbl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} sbbl $123456, 291(%r8,%rax,4)
+# CHECK: sbbl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ sbbl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} sbbq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} sbbq $123456, 291(%r8,%rax,4)
+# CHECK: sbbq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ sbbq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {evex} sbbb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x18,0xd9]
+ {evex} sbbb %bl, %cl
+# CHECK: sbbb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x18,0xd9]
+ sbbb %bl, %cl, %r8b
+# CHECK: {evex} sbbb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x18,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbb %bl, 291(%r8,%rax,4)
+# CHECK: sbbb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x18,0x9c,0x80,0x23,0x01,0x00,0x00]
+ sbbb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {evex} sbbw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x19,0xd0]
+ {evex} sbbw %dx, %ax
+# CHECK: sbbw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x19,0xd0]
+ sbbw %dx, %ax, %r9w
+# CHECK: {evex} sbbw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x19,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbw %dx, 291(%r8,%rax,4)
+# CHECK: sbbw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x19,0x94,0x80,0x23,0x01,0x00,0x00]
+ sbbw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {evex} sbbl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x19,0xca]
+ {evex} sbbl %ecx, %edx
+# CHECK: sbbl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x19,0xca]
+ sbbl %ecx, %edx, %r10d
+# CHECK: {evex} sbbl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbl %ecx, 291(%r8,%rax,4)
+# CHECK: sbbl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbbl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {evex} sbbq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x19,0xcf]
+ {evex} sbbq %r9, %r15
+# CHECK: sbbq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x19,0xcf]
+ sbbq %r9, %r15, %r11
+# CHECK: {evex} sbbq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbq %r9, 291(%r8,%rax,4)
+# CHECK: sbbq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbbq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {evex} sbbb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x1a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbb 291(%r8,%rax,4), %bl
+# CHECK: sbbb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x1a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ sbbb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {evex} sbbw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x1b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbw 291(%r8,%rax,4), %dx
+# CHECK: sbbw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x1b,0x94,0x80,0x23,0x01,0x00,0x00]
+ sbbw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {evex} sbbl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbl 291(%r8,%rax,4), %ecx
+# CHECK: sbbl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbbl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {evex} sbbq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbbq 291(%r8,%rax,4), %r9
+# CHECK: sbbq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbbq 291(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/sbb-intel.s b/llvm/test/MC/X86/apx/sbb-intel.s
new file mode 100644
index 000000000000..57d9c8110368
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sbb-intel.s
@@ -0,0 +1,158 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} sbb bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xdb,0x7b]
+ {evex} sbb bl, 123
+# CHECK: sbb cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xdb,0x7b]
+ sbb cl, bl, 123
+# CHECK: {evex} sbb dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xda,0x7b]
+ {evex} sbb dx, 123
+# CHECK: sbb ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xda,0x7b]
+ sbb ax, dx, 123
+# CHECK: {evex} sbb ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xd9,0x7b]
+ {evex} sbb ecx, 123
+# CHECK: sbb edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xd9,0x7b]
+ sbb edx, ecx, 123
+# CHECK: {evex} sbb r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xd9,0x7b]
+ {evex} sbb r9, 123
+# CHECK: sbb r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xd9,0x7b]
+ sbb r15, r9, 123
+# CHECK: {evex} sbb byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbb byte ptr [r8 + 4*rax + 291], 123
+# CHECK: sbb bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbb bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sbb word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbb word ptr [r8 + 4*rax + 291], 123
+# CHECK: sbb dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbb dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sbb dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbb dword ptr [r8 + 4*rax + 291], 123
+# CHECK: sbb ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbb ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sbb qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sbb qword ptr [r8 + 4*rax + 291], 123
+# CHECK: sbb r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sbb r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sbb dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xda,0xd2,0x04]
+ {evex} sbb dx, 1234
+# CHECK: sbb ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xda,0xd2,0x04]
+ sbb ax, dx, 1234
+# CHECK: {evex} sbb word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} sbb word ptr [r8 + 4*rax + 291], 1234
+# CHECK: sbb dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ sbb dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {evex} sbb ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ {evex} sbb ecx, 123456
+# CHECK: sbb edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ sbb edx, ecx, 123456
+# CHECK: {evex} sbb r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ {evex} sbb r9, 123456
+# CHECK: sbb r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xd9,0x40,0xe2,0x01,0x00]
+ sbb r15, r9, 123456
+# CHECK: {evex} sbb dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} sbb dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: sbb ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ sbb ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} sbb qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} sbb qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: sbb r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0x9c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ sbb r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} sbb cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x18,0xd9]
+ {evex} sbb cl, bl
+# CHECK: sbb r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x18,0xd9]
+ sbb r8b, cl, bl
+# CHECK: {evex} sbb byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x18,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb byte ptr [r8 + 4*rax + 291], bl
+# CHECK: sbb cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x18,0x9c,0x80,0x23,0x01,0x00,0x00]
+ sbb cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {evex} sbb ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x19,0xd0]
+ {evex} sbb ax, dx
+# CHECK: sbb r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x19,0xd0]
+ sbb r9w, ax, dx
+# CHECK: {evex} sbb word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x19,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb word ptr [r8 + 4*rax + 291], dx
+# CHECK: sbb ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x19,0x94,0x80,0x23,0x01,0x00,0x00]
+ sbb ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {evex} sbb edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x19,0xca]
+ {evex} sbb edx, ecx
+# CHECK: sbb r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x19,0xca]
+ sbb r10d, edx, ecx
+# CHECK: {evex} sbb dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: sbb edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbb edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {evex} sbb r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x19,0xcf]
+ {evex} sbb r15, r9
+# CHECK: sbb r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x19,0xcf]
+ sbb r11, r15, r9
+# CHECK: {evex} sbb qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb qword ptr [r8 + 4*rax + 291], r9
+# CHECK: sbb r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x19,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbb r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {evex} sbb bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x1a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: sbb cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x1a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ sbb cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sbb dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x1b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb dx, word ptr [r8 + 4*rax + 291]
+# CHECK: sbb ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x1b,0x94,0x80,0x23,0x01,0x00,0x00]
+ sbb ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sbb ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: sbb edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbb edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sbb r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sbb r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: sbb r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x1b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sbb r15, r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/sub-att.s b/llvm/test/MC/X86/apx/sub-att.s
new file mode 100644
index 000000000000..4c12a35b0bc2
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sub-att.s
@@ -0,0 +1,317 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-104: error:
+# ERROR-NOT: error:
+# CHECK: {evex} subb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xeb,0x7b]
+ {evex} subb $123, %bl
+# CHECK: {nf} subb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xeb,0x7b]
+ {nf} subb $123, %bl
+# CHECK: subb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xeb,0x7b]
+ subb $123, %bl, %cl
+# CHECK: {nf} subb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xeb,0x7b]
+ {nf} subb $123, %bl, %cl
+# CHECK: {evex} subw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xea,0x7b]
+ {evex} subw $123, %dx
+# CHECK: {nf} subw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xea,0x7b]
+ {nf} subw $123, %dx
+# CHECK: subw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xea,0x7b]
+ subw $123, %dx, %ax
+# CHECK: {nf} subw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xea,0x7b]
+ {nf} subw $123, %dx, %ax
+# CHECK: {evex} subl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xe9,0x7b]
+ {evex} subl $123, %ecx
+# CHECK: {nf} subl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xe9,0x7b]
+ {nf} subl $123, %ecx
+# CHECK: subl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xe9,0x7b]
+ subl $123, %ecx, %edx
+# CHECK: {nf} subl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xe9,0x7b]
+ {nf} subl $123, %ecx, %edx
+# CHECK: {evex} subq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xe9,0x7b]
+ {evex} subq $123, %r9
+# CHECK: {nf} subq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xe9,0x7b]
+ {nf} subq $123, %r9
+# CHECK: subq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xe9,0x7b]
+ subq $123, %r9, %r15
+# CHECK: {nf} subq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xe9,0x7b]
+ {nf} subq $123, %r9, %r15
+# CHECK: {evex} subb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} subb $123, 291(%r8,%rax,4)
+# CHECK: {nf} subb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subb $123, 291(%r8,%rax,4)
+# CHECK: subb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ subb $123, 291(%r8,%rax,4), %bl
+# CHECK: {nf} subb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} subw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} subw $123, 291(%r8,%rax,4)
+# CHECK: {nf} subw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subw $123, 291(%r8,%rax,4)
+# CHECK: subw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ subw $123, 291(%r8,%rax,4), %dx
+# CHECK: {nf} subw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} subl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} subl $123, 291(%r8,%rax,4)
+# CHECK: {nf} subl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subl $123, 291(%r8,%rax,4)
+# CHECK: subl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ subl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} subl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} subq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} subq $123, 291(%r8,%rax,4)
+# CHECK: {nf} subq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subq $123, 291(%r8,%rax,4)
+# CHECK: subq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ subq $123, 291(%r8,%rax,4), %r9
+# CHECK: {nf} subq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} subq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} subw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xea,0xd2,0x04]
+ {evex} subw $1234, %dx
+# CHECK: {nf} subw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xea,0xd2,0x04]
+ {nf} subw $1234, %dx
+# CHECK: subw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xea,0xd2,0x04]
+ subw $1234, %dx, %ax
+# CHECK: {nf} subw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xea,0xd2,0x04]
+ {nf} subw $1234, %dx, %ax
+# CHECK: {evex} subw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} subw $1234, 291(%r8,%rax,4)
+# CHECK: {nf} subw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} subw $1234, 291(%r8,%rax,4)
+# CHECK: subw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ subw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {nf} subw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} subw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {evex} subl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {evex} subl $123456, %ecx
+# CHECK: {nf} subl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} subl $123456, %ecx
+# CHECK: subl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ subl $123456, %ecx, %edx
+# CHECK: {nf} subl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} subl $123456, %ecx, %edx
+# CHECK: {evex} subq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {evex} subq $123456, %r9
+# CHECK: {nf} subq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} subq $123456, %r9
+# CHECK: subq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ subq $123456, %r9, %r15
+# CHECK: {nf} subq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} subq $123456, %r9, %r15
+# CHECK: {evex} subl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} subl $123456, 291(%r8,%rax,4)
+# CHECK: {nf} subl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} subl $123456, 291(%r8,%rax,4)
+# CHECK: subl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ subl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} subl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} subl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} subq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} subq $123456, 291(%r8,%rax,4)
+# CHECK: {nf} subq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} subq $123456, 291(%r8,%rax,4)
+# CHECK: subq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ subq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {nf} subq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} subq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {evex} subb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x28,0xd9]
+ {evex} subb %bl, %cl
+# CHECK: {nf} subb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x28,0xd9]
+ {nf} subb %bl, %cl
+# CHECK: subb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x28,0xd9]
+ subb %bl, %cl, %r8b
+# CHECK: {nf} subb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x28,0xd9]
+ {nf} subb %bl, %cl, %r8b
+# CHECK: {evex} subb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} subb %bl, 291(%r8,%rax,4)
+# CHECK: {nf} subb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subb %bl, 291(%r8,%rax,4)
+# CHECK: subb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ subb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {nf} subb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {evex} subw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x29,0xd0]
+ {evex} subw %dx, %ax
+# CHECK: {nf} subw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x29,0xd0]
+ {nf} subw %dx, %ax
+# CHECK: subw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x29,0xd0]
+ subw %dx, %ax, %r9w
+# CHECK: {nf} subw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x29,0xd0]
+ {nf} subw %dx, %ax, %r9w
+# CHECK: {evex} subw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} subw %dx, 291(%r8,%rax,4)
+# CHECK: {nf} subw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} subw %dx, 291(%r8,%rax,4)
+# CHECK: subw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ subw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {nf} subw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} subw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {evex} subl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x29,0xca]
+ {evex} subl %ecx, %edx
+# CHECK: {nf} subl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x29,0xca]
+ {nf} subl %ecx, %edx
+# CHECK: subl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x29,0xca]
+ subl %ecx, %edx, %r10d
+# CHECK: {nf} subl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x29,0xca]
+ {nf} subl %ecx, %edx, %r10d
+# CHECK: {evex} subl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} subl %ecx, 291(%r8,%rax,4)
+# CHECK: {nf} subl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subl %ecx, 291(%r8,%rax,4)
+# CHECK: subl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ subl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {nf} subl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {evex} subq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x29,0xcf]
+ {evex} subq %r9, %r15
+# CHECK: {nf} subq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x29,0xcf]
+ {nf} subq %r9, %r15
+# CHECK: subq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x29,0xcf]
+ subq %r9, %r15, %r11
+# CHECK: {nf} subq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x29,0xcf]
+ {nf} subq %r9, %r15, %r11
+# CHECK: {evex} subq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} subq %r9, 291(%r8,%rax,4)
+# CHECK: {nf} subq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subq %r9, 291(%r8,%rax,4)
+# CHECK: subq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ subq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {nf} subq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {evex} subb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} subb 291(%r8,%rax,4), %bl
+# CHECK: {nf} subb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subb 291(%r8,%rax,4), %bl
+# CHECK: subb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ subb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {nf} subb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {evex} subw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} subw 291(%r8,%rax,4), %dx
+# CHECK: {nf} subw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} subw 291(%r8,%rax,4), %dx
+# CHECK: subw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ subw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {nf} subw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} subw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {evex} subl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} subl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} subl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subl 291(%r8,%rax,4), %ecx
+# CHECK: subl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ subl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {nf} subl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {evex} subq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} subq 291(%r8,%rax,4), %r9
+# CHECK: {nf} subq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subq 291(%r8,%rax,4), %r9
+# CHECK: subq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ subq 291(%r8,%rax,4), %r9, %r15
+# CHECK: {nf} subq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} subq 291(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/sub-intel.s b/llvm/test/MC/X86/apx/sub-intel.s
new file mode 100644
index 000000000000..b66fa613d105
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sub-intel.s
@@ -0,0 +1,314 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} sub bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xeb,0x7b]
+ {evex} sub bl, 123
+# CHECK: {nf} sub bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xeb,0x7b]
+ {nf} sub bl, 123
+# CHECK: sub cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xeb,0x7b]
+ sub cl, bl, 123
+# CHECK: {nf} sub cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xeb,0x7b]
+ {nf} sub cl, bl, 123
+# CHECK: {evex} sub dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xea,0x7b]
+ {evex} sub dx, 123
+# CHECK: {nf} sub dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xea,0x7b]
+ {nf} sub dx, 123
+# CHECK: sub ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xea,0x7b]
+ sub ax, dx, 123
+# CHECK: {nf} sub ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xea,0x7b]
+ {nf} sub ax, dx, 123
+# CHECK: {evex} sub ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xe9,0x7b]
+ {evex} sub ecx, 123
+# CHECK: {nf} sub ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xe9,0x7b]
+ {nf} sub ecx, 123
+# CHECK: sub edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xe9,0x7b]
+ sub edx, ecx, 123
+# CHECK: {nf} sub edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xe9,0x7b]
+ {nf} sub edx, ecx, 123
+# CHECK: {evex} sub r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xe9,0x7b]
+ {evex} sub r9, 123
+# CHECK: {nf} sub r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xe9,0x7b]
+ {nf} sub r9, 123
+# CHECK: sub r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xe9,0x7b]
+ sub r15, r9, 123
+# CHECK: {nf} sub r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xe9,0x7b]
+ {nf} sub r15, r9, 123
+# CHECK: {evex} sub byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sub byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub byte ptr [r8 + 4*rax + 291], 123
+# CHECK: sub bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sub bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sub word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sub word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub word ptr [r8 + 4*rax + 291], 123
+# CHECK: sub dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sub dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sub dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sub dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub dword ptr [r8 + 4*rax + 291], 123
+# CHECK: sub ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sub ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sub qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sub qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub qword ptr [r8 + 4*rax + 291], 123
+# CHECK: sub r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sub r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sub r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sub r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} sub dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xea,0xd2,0x04]
+ {evex} sub dx, 1234
+# CHECK: {nf} sub dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xea,0xd2,0x04]
+ {nf} sub dx, 1234
+# CHECK: sub ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xea,0xd2,0x04]
+ sub ax, dx, 1234
+# CHECK: {nf} sub ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xea,0xd2,0x04]
+ {nf} sub ax, dx, 1234
+# CHECK: {evex} sub word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} sub word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} sub word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} sub word ptr [r8 + 4*rax + 291], 1234
+# CHECK: sub dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ sub dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} sub dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} sub dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {evex} sub ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {evex} sub ecx, 123456
+# CHECK: {nf} sub ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} sub ecx, 123456
+# CHECK: sub edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ sub edx, ecx, 123456
+# CHECK: {nf} sub edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} sub edx, ecx, 123456
+# CHECK: {evex} sub r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {evex} sub r9, 123456
+# CHECK: {nf} sub r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} sub r9, 123456
+# CHECK: sub r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ sub r15, r9, 123456
+# CHECK: {nf} sub r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xe9,0x40,0xe2,0x01,0x00]
+ {nf} sub r15, r9, 123456
+# CHECK: {evex} sub dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} sub dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} sub dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} sub dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: sub ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ sub ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} sub ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} sub ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} sub qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} sub qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} sub qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} sub qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: sub r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ sub r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} sub r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0xac,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} sub r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} sub cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x28,0xd9]
+ {evex} sub cl, bl
+# CHECK: {nf} sub cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x28,0xd9]
+ {nf} sub cl, bl
+# CHECK: sub r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x28,0xd9]
+ sub r8b, cl, bl
+# CHECK: {nf} sub r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x28,0xd9]
+ {nf} sub r8b, cl, bl
+# CHECK: {evex} sub byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} sub byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub byte ptr [r8 + 4*rax + 291], bl
+# CHECK: sub cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ sub cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} sub cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x28,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {evex} sub ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x29,0xd0]
+ {evex} sub ax, dx
+# CHECK: {nf} sub ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x29,0xd0]
+ {nf} sub ax, dx
+# CHECK: sub r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x29,0xd0]
+ sub r9w, ax, dx
+# CHECK: {nf} sub r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x29,0xd0]
+ {nf} sub r9w, ax, dx
+# CHECK: {evex} sub word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} sub word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub word ptr [r8 + 4*rax + 291], dx
+# CHECK: sub ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ sub ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} sub ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x29,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {evex} sub edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x29,0xca]
+ {evex} sub edx, ecx
+# CHECK: {nf} sub edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x29,0xca]
+ {nf} sub edx, ecx
+# CHECK: sub r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x29,0xca]
+ sub r10d, edx, ecx
+# CHECK: {nf} sub r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x29,0xca]
+ {nf} sub r10d, edx, ecx
+# CHECK: {evex} sub dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} sub dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: sub edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sub edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} sub edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {evex} sub r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x29,0xcf]
+ {evex} sub r15, r9
+# CHECK: {nf} sub r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x29,0xcf]
+ {nf} sub r15, r9
+# CHECK: sub r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x29,0xcf]
+ sub r11, r15, r9
+# CHECK: {nf} sub r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x29,0xcf]
+ {nf} sub r11, r15, r9
+# CHECK: {evex} sub qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} sub qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub qword ptr [r8 + 4*rax + 291], r9
+# CHECK: sub r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sub r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} sub r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x29,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {evex} sub bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: sub cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ sub cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x2a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sub dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub dx, word ptr [r8 + 4*rax + 291]
+# CHECK: sub ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ sub ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x2b,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sub ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: sub edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sub edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sub r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} sub r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: sub r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ sub r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sub r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x2b,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} sub r15, r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/wrssd-att.s b/llvm/test/MC/X86/apx/wrssd-att.s
new file mode 100644
index 000000000000..409b3010f5c7
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssd-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrssd %r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrssd %r18d, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrssd-intel.s b/llvm/test/MC/X86/apx/wrssd-intel.s
new file mode 100644
index 000000000000..1d402f2c5177
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssd-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrssd dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrssd dword ptr [r28 + 4*r29 + 291], r18d
diff --git a/llvm/test/MC/X86/apx/wrssq-att.s b/llvm/test/MC/X86/apx/wrssq-att.s
new file mode 100644
index 000000000000..1f616ac2e4e4
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssq-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrssq %r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrssq %r19, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrssq-intel.s b/llvm/test/MC/X86/apx/wrssq-intel.s
new file mode 100644
index 000000000000..d31dca55ca4a
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssq-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrssq qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrssq qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/MC/X86/apx/wrussd-att.s b/llvm/test/MC/X86/apx/wrussd-att.s
new file mode 100644
index 000000000000..269d9a8aa858
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussd-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrussd %r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrussd %r18d, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrussd-intel.s b/llvm/test/MC/X86/apx/wrussd-intel.s
new file mode 100644
index 000000000000..fed6eb10d4ad
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussd-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrussd dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrussd dword ptr [r28 + 4*r29 + 291], r18d
diff --git a/llvm/test/MC/X86/apx/wrussq-att.s b/llvm/test/MC/X86/apx/wrussq-att.s
new file mode 100644
index 000000000000..b41360cd9db0
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussq-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrussq %r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrussq %r19, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrussq-intel.s b/llvm/test/MC/X86/apx/wrussq-intel.s
new file mode 100644
index 000000000000..a9a96da9d3d1
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussq-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrussq qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrussq qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/MC/X86/apx/xor-att.s b/llvm/test/MC/X86/apx/xor-att.s
new file mode 100644
index 000000000000..509cf5b86fe9
--- /dev/null
+++ b/llvm/test/MC/X86/apx/xor-att.s
@@ -0,0 +1,317 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-104: error:
+# ERROR-NOT: error:
+# CHECK: {evex} xorb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xf3,0x7b]
+ {evex} xorb $123, %bl
+# CHECK: {nf} xorb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xf3,0x7b]
+ {nf} xorb $123, %bl
+# CHECK: xorb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xf3,0x7b]
+ xorb $123, %bl, %cl
+# CHECK: {nf} xorb $123, %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xf3,0x7b]
+ {nf} xorb $123, %bl, %cl
+# CHECK: {evex} xorw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xf2,0x7b]
+ {evex} xorw $123, %dx
+# CHECK: {nf} xorw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xf2,0x7b]
+ {nf} xorw $123, %dx
+# CHECK: xorw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xf2,0x7b]
+ xorw $123, %dx, %ax
+# CHECK: {nf} xorw $123, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xf2,0x7b]
+ {nf} xorw $123, %dx, %ax
+# CHECK: {evex} xorl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xf1,0x7b]
+ {evex} xorl $123, %ecx
+# CHECK: {nf} xorl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xf1,0x7b]
+ {nf} xorl $123, %ecx
+# CHECK: xorl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xf1,0x7b]
+ xorl $123, %ecx, %edx
+# CHECK: {nf} xorl $123, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xf1,0x7b]
+ {nf} xorl $123, %ecx, %edx
+# CHECK: {evex} xorq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xf1,0x7b]
+ {evex} xorq $123, %r9
+# CHECK: {nf} xorq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xf1,0x7b]
+ {nf} xorq $123, %r9
+# CHECK: xorq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xf1,0x7b]
+ xorq $123, %r9, %r15
+# CHECK: {nf} xorq $123, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xf1,0x7b]
+ {nf} xorq $123, %r9, %r15
+# CHECK: {evex} xorb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xorb $123, 291(%r8,%rax,4)
+# CHECK: {nf} xorb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorb $123, 291(%r8,%rax,4)
+# CHECK: xorb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xorb $123, 291(%r8,%rax,4), %bl
+# CHECK: {nf} xorb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} xorw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xorw $123, 291(%r8,%rax,4)
+# CHECK: {nf} xorw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorw $123, 291(%r8,%rax,4)
+# CHECK: xorw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xorw $123, 291(%r8,%rax,4), %dx
+# CHECK: {nf} xorw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} xorl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xorl $123, 291(%r8,%rax,4)
+# CHECK: {nf} xorl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorl $123, 291(%r8,%rax,4)
+# CHECK: xorl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xorl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} xorl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} xorq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xorq $123, 291(%r8,%rax,4)
+# CHECK: {nf} xorq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorq $123, 291(%r8,%rax,4)
+# CHECK: xorq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xorq $123, 291(%r8,%rax,4), %r9
+# CHECK: {nf} xorq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xorq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} xorw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xf2,0xd2,0x04]
+ {evex} xorw $1234, %dx
+# CHECK: {nf} xorw $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xf2,0xd2,0x04]
+ {nf} xorw $1234, %dx
+# CHECK: xorw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xf2,0xd2,0x04]
+ xorw $1234, %dx, %ax
+# CHECK: {nf} xorw $1234, %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xf2,0xd2,0x04]
+ {nf} xorw $1234, %dx, %ax
+# CHECK: {evex} xorw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} xorw $1234, 291(%r8,%rax,4)
+# CHECK: {nf} xorw $1234, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} xorw $1234, 291(%r8,%rax,4)
+# CHECK: xorw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ xorw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {nf} xorw $1234, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} xorw $1234, 291(%r8,%rax,4), %dx
+# CHECK: {evex} xorl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {evex} xorl $123456, %ecx
+# CHECK: {nf} xorl $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xorl $123456, %ecx
+# CHECK: xorl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ xorl $123456, %ecx, %edx
+# CHECK: {nf} xorl $123456, %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xorl $123456, %ecx, %edx
+# CHECK: {evex} xorq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {evex} xorq $123456, %r9
+# CHECK: {nf} xorq $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xorq $123456, %r9
+# CHECK: xorq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ xorq $123456, %r9, %r15
+# CHECK: {nf} xorq $123456, %r9, %r15
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xorq $123456, %r9, %r15
+# CHECK: {evex} xorl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} xorl $123456, 291(%r8,%rax,4)
+# CHECK: {nf} xorl $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xorl $123456, 291(%r8,%rax,4)
+# CHECK: xorl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ xorl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} xorl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xorl $123456, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} xorq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} xorq $123456, 291(%r8,%rax,4)
+# CHECK: {nf} xorq $123456, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xorq $123456, 291(%r8,%rax,4)
+# CHECK: xorq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ xorq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {nf} xorq $123456, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xorq $123456, 291(%r8,%rax,4), %r9
+# CHECK: {evex} xorb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x30,0xd9]
+ {evex} xorb %bl, %cl
+# CHECK: {nf} xorb %bl, %cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x30,0xd9]
+ {nf} xorb %bl, %cl
+# CHECK: xorb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x30,0xd9]
+ xorb %bl, %cl, %r8b
+# CHECK: {nf} xorb %bl, %cl, %r8b
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x30,0xd9]
+ {nf} xorb %bl, %cl, %r8b
+# CHECK: {evex} xorb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorb %bl, 291(%r8,%rax,4)
+# CHECK: {nf} xorb %bl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorb %bl, 291(%r8,%rax,4)
+# CHECK: xorb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ xorb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {nf} xorb %bl, 291(%r8,%rax,4), %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorb %bl, 291(%r8,%rax,4), %cl
+# CHECK: {evex} xorw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x31,0xd0]
+ {evex} xorw %dx, %ax
+# CHECK: {nf} xorw %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x31,0xd0]
+ {nf} xorw %dx, %ax
+# CHECK: xorw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x31,0xd0]
+ xorw %dx, %ax, %r9w
+# CHECK: {nf} xorw %dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x31,0xd0]
+ {nf} xorw %dx, %ax, %r9w
+# CHECK: {evex} xorw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorw %dx, 291(%r8,%rax,4)
+# CHECK: {nf} xorw %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorw %dx, 291(%r8,%rax,4)
+# CHECK: xorw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ xorw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {nf} xorw %dx, 291(%r8,%rax,4), %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorw %dx, 291(%r8,%rax,4), %ax
+# CHECK: {evex} xorl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x31,0xca]
+ {evex} xorl %ecx, %edx
+# CHECK: {nf} xorl %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x31,0xca]
+ {nf} xorl %ecx, %edx
+# CHECK: xorl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x31,0xca]
+ xorl %ecx, %edx, %r10d
+# CHECK: {nf} xorl %ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x31,0xca]
+ {nf} xorl %ecx, %edx, %r10d
+# CHECK: {evex} xorl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorl %ecx, 291(%r8,%rax,4)
+# CHECK: {nf} xorl %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorl %ecx, 291(%r8,%rax,4)
+# CHECK: xorl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xorl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {nf} xorl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorl %ecx, 291(%r8,%rax,4), %edx
+# CHECK: {evex} xorq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x31,0xcf]
+ {evex} xorq %r9, %r15
+# CHECK: {nf} xorq %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x31,0xcf]
+ {nf} xorq %r9, %r15
+# CHECK: xorq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x31,0xcf]
+ xorq %r9, %r15, %r11
+# CHECK: {nf} xorq %r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x31,0xcf]
+ {nf} xorq %r9, %r15, %r11
+# CHECK: {evex} xorq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorq %r9, 291(%r8,%rax,4)
+# CHECK: {nf} xorq %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorq %r9, 291(%r8,%rax,4)
+# CHECK: xorq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xorq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {nf} xorq %r9, 291(%r8,%rax,4), %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorq %r9, 291(%r8,%rax,4), %r15
+# CHECK: {evex} xorb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorb 291(%r8,%rax,4), %bl
+# CHECK: {nf} xorb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorb 291(%r8,%rax,4), %bl
+# CHECK: xorb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ xorb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {nf} xorb 291(%r8,%rax,4), %bl, %cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorb 291(%r8,%rax,4), %bl, %cl
+# CHECK: {evex} xorw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorw 291(%r8,%rax,4), %dx
+# CHECK: {nf} xorw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorw 291(%r8,%rax,4), %dx
+# CHECK: xorw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ xorw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {nf} xorw 291(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorw 291(%r8,%rax,4), %dx, %ax
+# CHECK: {evex} xorl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} xorl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorl 291(%r8,%rax,4), %ecx
+# CHECK: xorl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xorl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {nf} xorl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorl 291(%r8,%rax,4), %ecx, %edx
+# CHECK: {evex} xorq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xorq 291(%r8,%rax,4), %r9
+# CHECK: {nf} xorq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorq 291(%r8,%rax,4), %r9
+# CHECK: xorq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xorq 291(%r8,%rax,4), %r9, %r15
+# CHECK: {nf} xorq 291(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xorq 291(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/xor-intel.s b/llvm/test/MC/X86/apx/xor-intel.s
new file mode 100644
index 000000000000..8b2e29a35239
--- /dev/null
+++ b/llvm/test/MC/X86/apx/xor-intel.s
@@ -0,0 +1,314 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} xor bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x80,0xf3,0x7b]
+ {evex} xor bl, 123
+# CHECK: {nf} xor bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x80,0xf3,0x7b]
+ {nf} xor bl, 123
+# CHECK: xor cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x80,0xf3,0x7b]
+ xor cl, bl, 123
+# CHECK: {nf} xor cl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x80,0xf3,0x7b]
+ {nf} xor cl, bl, 123
+# CHECK: {evex} xor dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x83,0xf2,0x7b]
+ {evex} xor dx, 123
+# CHECK: {nf} xor dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x83,0xf2,0x7b]
+ {nf} xor dx, 123
+# CHECK: xor ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x83,0xf2,0x7b]
+ xor ax, dx, 123
+# CHECK: {nf} xor ax, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xf2,0x7b]
+ {nf} xor ax, dx, 123
+# CHECK: {evex} xor ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x83,0xf1,0x7b]
+ {evex} xor ecx, 123
+# CHECK: {nf} xor ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x83,0xf1,0x7b]
+ {nf} xor ecx, 123
+# CHECK: xor edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x83,0xf1,0x7b]
+ xor edx, ecx, 123
+# CHECK: {nf} xor edx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x83,0xf1,0x7b]
+ {nf} xor edx, ecx, 123
+# CHECK: {evex} xor r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xf1,0x7b]
+ {evex} xor r9, 123
+# CHECK: {nf} xor r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xf1,0x7b]
+ {nf} xor r9, 123
+# CHECK: xor r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x83,0xf1,0x7b]
+ xor r15, r9, 123
+# CHECK: {nf} xor r15, r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x83,0xf1,0x7b]
+ {nf} xor r15, r9, 123
+# CHECK: {evex} xor byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xor byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor byte ptr [r8 + 4*rax + 291], 123
+# CHECK: xor bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xor bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0x80,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} xor word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xor word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor word ptr [r8 + 4*rax + 291], 123
+# CHECK: xor dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xor dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} xor dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xor dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor dword ptr [r8 + 4*rax + 291], 123
+# CHECK: xor ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xor ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} xor qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} xor qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor qword ptr [r8 + 4*rax + 291], 123
+# CHECK: xor r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ xor r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} xor r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x83,0xb4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} xor r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} xor dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x81,0xf2,0xd2,0x04]
+ {evex} xor dx, 1234
+# CHECK: {nf} xor dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x81,0xf2,0xd2,0x04]
+ {nf} xor dx, 1234
+# CHECK: xor ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x81,0xf2,0xd2,0x04]
+ xor ax, dx, 1234
+# CHECK: {nf} xor ax, dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xf2,0xd2,0x04]
+ {nf} xor ax, dx, 1234
+# CHECK: {evex} xor word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {evex} xor word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} xor word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} xor word ptr [r8 + 4*rax + 291], 1234
+# CHECK: xor dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ xor dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {nf} xor dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0xd2,0x04]
+ {nf} xor dx, word ptr [r8 + 4*rax + 291], 1234
+# CHECK: {evex} xor ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {evex} xor ecx, 123456
+# CHECK: {nf} xor ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xor ecx, 123456
+# CHECK: xor edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x18,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ xor edx, ecx, 123456
+# CHECK: {nf} xor edx, ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x6c,0x1c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xor edx, ecx, 123456
+# CHECK: {evex} xor r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {evex} xor r9, 123456
+# CHECK: {nf} xor r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xor r9, 123456
+# CHECK: xor r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x18,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ xor r15, r9, 123456
+# CHECK: {nf} xor r15, r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x1c,0x81,0xf1,0x40,0xe2,0x01,0x00]
+ {nf} xor r15, r9, 123456
+# CHECK: {evex} xor dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} xor dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} xor dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xor dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: xor ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ xor ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} xor ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xor ecx, dword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} xor qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {evex} xor qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} xor qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xor qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: xor r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ xor r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {nf} xor r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0x81,0xb4,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00]
+ {nf} xor r9, qword ptr [r8 + 4*rax + 291], 123456
+# CHECK: {evex} xor cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x30,0xd9]
+ {evex} xor cl, bl
+# CHECK: {nf} xor cl, bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x30,0xd9]
+ {nf} xor cl, bl
+# CHECK: xor r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x18,0x30,0xd9]
+ xor r8b, cl, bl
+# CHECK: {nf} xor r8b, cl, bl
+# CHECK: encoding: [0x62,0xf4,0x3c,0x1c,0x30,0xd9]
+ {nf} xor r8b, cl, bl
+# CHECK: {evex} xor byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} xor byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor byte ptr [r8 + 4*rax + 291], bl
+# CHECK: xor cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ xor cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {nf} xor cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x30,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor cl, byte ptr [r8 + 4*rax + 291], bl
+# CHECK: {evex} xor ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x31,0xd0]
+ {evex} xor ax, dx
+# CHECK: {nf} xor ax, dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x31,0xd0]
+ {nf} xor ax, dx
+# CHECK: xor r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x31,0xd0]
+ xor r9w, ax, dx
+# CHECK: {nf} xor r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x1c,0x31,0xd0]
+ {nf} xor r9w, ax, dx
+# CHECK: {evex} xor word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} xor word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor word ptr [r8 + 4*rax + 291], dx
+# CHECK: xor ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ xor ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {nf} xor ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x31,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor ax, word ptr [r8 + 4*rax + 291], dx
+# CHECK: {evex} xor edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x31,0xca]
+ {evex} xor edx, ecx
+# CHECK: {nf} xor edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x31,0xca]
+ {nf} xor edx, ecx
+# CHECK: xor r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x31,0xca]
+ xor r10d, edx, ecx
+# CHECK: {nf} xor r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x1c,0x31,0xca]
+ {nf} xor r10d, edx, ecx
+# CHECK: {evex} xor dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} xor dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: xor edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xor edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {nf} xor edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor edx, dword ptr [r8 + 4*rax + 291], ecx
+# CHECK: {evex} xor r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x31,0xcf]
+ {evex} xor r15, r9
+# CHECK: {nf} xor r15, r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x31,0xcf]
+ {nf} xor r15, r9
+# CHECK: xor r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x31,0xcf]
+ xor r11, r15, r9
+# CHECK: {nf} xor r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x1c,0x31,0xcf]
+ {nf} xor r11, r15, r9
+# CHECK: {evex} xor qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} xor qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor qword ptr [r8 + 4*rax + 291], r9
+# CHECK: xor r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xor r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {nf} xor r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x31,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor r15, qword ptr [r8 + 4*rax + 291], r9
+# CHECK: {evex} xor bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: xor cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ xor cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x32,0x9c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor cl, bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} xor dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor dx, word ptr [r8 + 4*rax + 291]
+# CHECK: xor ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ xor ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x1c,0x33,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor ax, dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} xor ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: xor edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xor edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x1c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor edx, ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} xor r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} xor r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: xor r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ xor r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} xor r15, r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0x54,0x84,0x1c,0x33,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} xor r15, r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/register-assignment-error.s b/llvm/test/MC/X86/register-assignment-error.s
deleted file mode 100644
index 6c5fcf3cae9e..000000000000
--- a/llvm/test/MC/X86/register-assignment-error.s
+++ /dev/null
@@ -1,8 +0,0 @@
-// RUN: not llvm-mc -triple x86_64 %s -o /dev/null 2>&1 | FileCheck %s
-
-var_xdata = %rcx
-
-// This used to crash.
-.if var_xdata == 1
-.endif
-// CHECK: error: expected absolute expression
\ No newline at end of file
diff --git a/llvm/test/MC/X86/register-assignment.s b/llvm/test/MC/X86/register-assignment.s
index 84ea062cb746..4c7b07df050b 100644
--- a/llvm/test/MC/X86/register-assignment.s
+++ b/llvm/test/MC/X86/register-assignment.s
@@ -1,4 +1,5 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown %s -o - | FileCheck %s
+// RUN: not llvm-mc -triple x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR
// CHECK-NOT: .set var_xdata
var_xdata = %rcx
@@ -25,3 +26,12 @@ xorq var_xdata, var_xdata
.else
.byte 2
.endif
+
+.ifdef ERR
+// ERR: [[#@LINE+1]]:5: error: expected absolute expression
+.if var_xdata == 1
+.endif
+// ERR: [[#@LINE+1]]:5: error: expected absolute expression
+.if 1 == var_xdata
+.endif
+.endif
diff --git a/llvm/test/MC/X86/x86_64-asm-match.s b/llvm/test/MC/X86/x86_64-asm-match.s
index cb1a40d54153..68d375ec3e4c 100644
--- a/llvm/test/MC/X86/x86_64-asm-match.s
+++ b/llvm/test/MC/X86/x86_64-asm-match.s
@@ -29,7 +29,7 @@
// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm5): match success using generic matcher
// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range
// CHECK: Opcode result: complete match, selecting this opcode
-// CHECK: AsmMatcher: found 2 encodings with mnemonic 'crc32l'
+// CHECK: AsmMatcher: found 4 encodings with mnemonic 'crc32l'
// CHECK: Trying to match opcode CRC32r32r32
// CHECK: Matching formal operand class MCK_GR32 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rbx,IndexReg=rcx,Scale=8,Disp=2125315823,SegReg=gs): Opcode result: multiple operand mismatches, ignoring this opcode
// CHECK: Trying to match opcode CRC32r32m32
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 4f957d104d8d..8ac46a5b6692 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -21,28 +21,50 @@ static const X86FoldTableEntry Table2Addr[] = {
{X86::ADC8rr, X86::ADC8mr, TB_NO_REVERSE},
{X86::ADD16ri, X86::ADD16mi, TB_NO_REVERSE},
{X86::ADD16ri8, X86::ADD16mi8, TB_NO_REVERSE},
+ {X86::ADD16ri8_NF, X86::ADD16mi8_NF, TB_NO_REVERSE},
+ {X86::ADD16ri_NF, X86::ADD16mi_NF, TB_NO_REVERSE},
{X86::ADD16rr, X86::ADD16mr, TB_NO_REVERSE},
+ {X86::ADD16rr_NF, X86::ADD16mr_NF, TB_NO_REVERSE},
{X86::ADD32ri, X86::ADD32mi, TB_NO_REVERSE},
{X86::ADD32ri8, X86::ADD32mi8, TB_NO_REVERSE},
+ {X86::ADD32ri8_NF, X86::ADD32mi8_NF, TB_NO_REVERSE},
+ {X86::ADD32ri_NF, X86::ADD32mi_NF, TB_NO_REVERSE},
{X86::ADD32rr, X86::ADD32mr, TB_NO_REVERSE},
+ {X86::ADD32rr_NF, X86::ADD32mr_NF, TB_NO_REVERSE},
{X86::ADD64ri32, X86::ADD64mi32, TB_NO_REVERSE},
+ {X86::ADD64ri32_NF, X86::ADD64mi32_NF, TB_NO_REVERSE},
{X86::ADD64ri8, X86::ADD64mi8, TB_NO_REVERSE},
+ {X86::ADD64ri8_NF, X86::ADD64mi8_NF, TB_NO_REVERSE},
{X86::ADD64rr, X86::ADD64mr, TB_NO_REVERSE},
+ {X86::ADD64rr_NF, X86::ADD64mr_NF, TB_NO_REVERSE},
{X86::ADD8ri, X86::ADD8mi, TB_NO_REVERSE},
{X86::ADD8ri8, X86::ADD8mi8, TB_NO_REVERSE},
+ {X86::ADD8ri_NF, X86::ADD8mi_NF, TB_NO_REVERSE},
{X86::ADD8rr, X86::ADD8mr, TB_NO_REVERSE},
+ {X86::ADD8rr_NF, X86::ADD8mr_NF, TB_NO_REVERSE},
{X86::AND16ri, X86::AND16mi, TB_NO_REVERSE},
{X86::AND16ri8, X86::AND16mi8, TB_NO_REVERSE},
+ {X86::AND16ri8_NF, X86::AND16mi8_NF, TB_NO_REVERSE},
+ {X86::AND16ri_NF, X86::AND16mi_NF, TB_NO_REVERSE},
{X86::AND16rr, X86::AND16mr, TB_NO_REVERSE},
+ {X86::AND16rr_NF, X86::AND16mr_NF, TB_NO_REVERSE},
{X86::AND32ri, X86::AND32mi, TB_NO_REVERSE},
{X86::AND32ri8, X86::AND32mi8, TB_NO_REVERSE},
+ {X86::AND32ri8_NF, X86::AND32mi8_NF, TB_NO_REVERSE},
+ {X86::AND32ri_NF, X86::AND32mi_NF, TB_NO_REVERSE},
{X86::AND32rr, X86::AND32mr, TB_NO_REVERSE},
+ {X86::AND32rr_NF, X86::AND32mr_NF, TB_NO_REVERSE},
{X86::AND64ri32, X86::AND64mi32, TB_NO_REVERSE},
+ {X86::AND64ri32_NF, X86::AND64mi32_NF, TB_NO_REVERSE},
{X86::AND64ri8, X86::AND64mi8, TB_NO_REVERSE},
+ {X86::AND64ri8_NF, X86::AND64mi8_NF, TB_NO_REVERSE},
{X86::AND64rr, X86::AND64mr, TB_NO_REVERSE},
+ {X86::AND64rr_NF, X86::AND64mr_NF, TB_NO_REVERSE},
{X86::AND8ri, X86::AND8mi, TB_NO_REVERSE},
{X86::AND8ri8, X86::AND8mi8, TB_NO_REVERSE},
+ {X86::AND8ri_NF, X86::AND8mi_NF, TB_NO_REVERSE},
{X86::AND8rr, X86::AND8mr, TB_NO_REVERSE},
+ {X86::AND8rr_NF, X86::AND8mr_NF, TB_NO_REVERSE},
{X86::BTC16ri8, X86::BTC16mi8, TB_NO_REVERSE},
{X86::BTC32ri8, X86::BTC32mi8, TB_NO_REVERSE},
{X86::BTC64ri8, X86::BTC64mi8, TB_NO_REVERSE},
@@ -61,25 +83,40 @@ static const X86FoldTableEntry Table2Addr[] = {
{X86::INC64r, X86::INC64m, TB_NO_REVERSE},
{X86::INC8r, X86::INC8m, TB_NO_REVERSE},
{X86::NEG16r, X86::NEG16m, TB_NO_REVERSE},
+ {X86::NEG16r_NF, X86::NEG16m_NF, TB_NO_REVERSE},
{X86::NEG32r, X86::NEG32m, TB_NO_REVERSE},
+ {X86::NEG32r_NF, X86::NEG32m_NF, TB_NO_REVERSE},
{X86::NEG64r, X86::NEG64m, TB_NO_REVERSE},
+ {X86::NEG64r_NF, X86::NEG64m_NF, TB_NO_REVERSE},
{X86::NEG8r, X86::NEG8m, TB_NO_REVERSE},
+ {X86::NEG8r_NF, X86::NEG8m_NF, TB_NO_REVERSE},
{X86::NOT16r, X86::NOT16m, TB_NO_REVERSE},
{X86::NOT32r, X86::NOT32m, TB_NO_REVERSE},
{X86::NOT64r, X86::NOT64m, TB_NO_REVERSE},
{X86::NOT8r, X86::NOT8m, TB_NO_REVERSE},
{X86::OR16ri, X86::OR16mi, TB_NO_REVERSE},
{X86::OR16ri8, X86::OR16mi8, TB_NO_REVERSE},
+ {X86::OR16ri8_NF, X86::OR16mi8_NF, TB_NO_REVERSE},
+ {X86::OR16ri_NF, X86::OR16mi_NF, TB_NO_REVERSE},
{X86::OR16rr, X86::OR16mr, TB_NO_REVERSE},
+ {X86::OR16rr_NF, X86::OR16mr_NF, TB_NO_REVERSE},
{X86::OR32ri, X86::OR32mi, TB_NO_REVERSE},
{X86::OR32ri8, X86::OR32mi8, TB_NO_REVERSE},
+ {X86::OR32ri8_NF, X86::OR32mi8_NF, TB_NO_REVERSE},
+ {X86::OR32ri_NF, X86::OR32mi_NF, TB_NO_REVERSE},
{X86::OR32rr, X86::OR32mr, TB_NO_REVERSE},
+ {X86::OR32rr_NF, X86::OR32mr_NF, TB_NO_REVERSE},
{X86::OR64ri32, X86::OR64mi32, TB_NO_REVERSE},
+ {X86::OR64ri32_NF, X86::OR64mi32_NF, TB_NO_REVERSE},
{X86::OR64ri8, X86::OR64mi8, TB_NO_REVERSE},
+ {X86::OR64ri8_NF, X86::OR64mi8_NF, TB_NO_REVERSE},
{X86::OR64rr, X86::OR64mr, TB_NO_REVERSE},
+ {X86::OR64rr_NF, X86::OR64mr_NF, TB_NO_REVERSE},
{X86::OR8ri, X86::OR8mi, TB_NO_REVERSE},
{X86::OR8ri8, X86::OR8mi8, TB_NO_REVERSE},
+ {X86::OR8ri_NF, X86::OR8mi_NF, TB_NO_REVERSE},
{X86::OR8rr, X86::OR8mr, TB_NO_REVERSE},
+ {X86::OR8rr_NF, X86::OR8mr_NF, TB_NO_REVERSE},
{X86::RCL16r1, X86::RCL16m1, TB_NO_REVERSE},
{X86::RCL16rCL, X86::RCL16mCL, TB_NO_REVERSE},
{X86::RCL16ri, X86::RCL16mi, TB_NO_REVERSE},
@@ -190,28 +227,50 @@ static const X86FoldTableEntry Table2Addr[] = {
{X86::SHRD64rri8, X86::SHRD64mri8, TB_NO_REVERSE},
{X86::SUB16ri, X86::SUB16mi, TB_NO_REVERSE},
{X86::SUB16ri8, X86::SUB16mi8, TB_NO_REVERSE},
+ {X86::SUB16ri8_NF, X86::SUB16mi8_NF, TB_NO_REVERSE},
+ {X86::SUB16ri_NF, X86::SUB16mi_NF, TB_NO_REVERSE},
{X86::SUB16rr, X86::SUB16mr, TB_NO_REVERSE},
+ {X86::SUB16rr_NF, X86::SUB16mr_NF, TB_NO_REVERSE},
{X86::SUB32ri, X86::SUB32mi, TB_NO_REVERSE},
{X86::SUB32ri8, X86::SUB32mi8, TB_NO_REVERSE},
+ {X86::SUB32ri8_NF, X86::SUB32mi8_NF, TB_NO_REVERSE},
+ {X86::SUB32ri_NF, X86::SUB32mi_NF, TB_NO_REVERSE},
{X86::SUB32rr, X86::SUB32mr, TB_NO_REVERSE},
+ {X86::SUB32rr_NF, X86::SUB32mr_NF, TB_NO_REVERSE},
{X86::SUB64ri32, X86::SUB64mi32, TB_NO_REVERSE},
+ {X86::SUB64ri32_NF, X86::SUB64mi32_NF, TB_NO_REVERSE},
{X86::SUB64ri8, X86::SUB64mi8, TB_NO_REVERSE},
+ {X86::SUB64ri8_NF, X86::SUB64mi8_NF, TB_NO_REVERSE},
{X86::SUB64rr, X86::SUB64mr, TB_NO_REVERSE},
+ {X86::SUB64rr_NF, X86::SUB64mr_NF, TB_NO_REVERSE},
{X86::SUB8ri, X86::SUB8mi, TB_NO_REVERSE},
{X86::SUB8ri8, X86::SUB8mi8, TB_NO_REVERSE},
+ {X86::SUB8ri_NF, X86::SUB8mi_NF, TB_NO_REVERSE},
{X86::SUB8rr, X86::SUB8mr, TB_NO_REVERSE},
+ {X86::SUB8rr_NF, X86::SUB8mr_NF, TB_NO_REVERSE},
{X86::XOR16ri, X86::XOR16mi, TB_NO_REVERSE},
{X86::XOR16ri8, X86::XOR16mi8, TB_NO_REVERSE},
+ {X86::XOR16ri8_NF, X86::XOR16mi8_NF, TB_NO_REVERSE},
+ {X86::XOR16ri_NF, X86::XOR16mi_NF, TB_NO_REVERSE},
{X86::XOR16rr, X86::XOR16mr, TB_NO_REVERSE},
+ {X86::XOR16rr_NF, X86::XOR16mr_NF, TB_NO_REVERSE},
{X86::XOR32ri, X86::XOR32mi, TB_NO_REVERSE},
{X86::XOR32ri8, X86::XOR32mi8, TB_NO_REVERSE},
+ {X86::XOR32ri8_NF, X86::XOR32mi8_NF, TB_NO_REVERSE},
+ {X86::XOR32ri_NF, X86::XOR32mi_NF, TB_NO_REVERSE},
{X86::XOR32rr, X86::XOR32mr, TB_NO_REVERSE},
+ {X86::XOR32rr_NF, X86::XOR32mr_NF, TB_NO_REVERSE},
{X86::XOR64ri32, X86::XOR64mi32, TB_NO_REVERSE},
+ {X86::XOR64ri32_NF, X86::XOR64mi32_NF, TB_NO_REVERSE},
{X86::XOR64ri8, X86::XOR64mi8, TB_NO_REVERSE},
+ {X86::XOR64ri8_NF, X86::XOR64mi8_NF, TB_NO_REVERSE},
{X86::XOR64rr, X86::XOR64mr, TB_NO_REVERSE},
+ {X86::XOR64rr_NF, X86::XOR64mr_NF, TB_NO_REVERSE},
{X86::XOR8ri, X86::XOR8mi, TB_NO_REVERSE},
{X86::XOR8ri8, X86::XOR8mi8, TB_NO_REVERSE},
+ {X86::XOR8ri_NF, X86::XOR8mi_NF, TB_NO_REVERSE},
{X86::XOR8rr, X86::XOR8mr, TB_NO_REVERSE},
+ {X86::XOR8rr_NF, X86::XOR8mr_NF, TB_NO_REVERSE},
};
static const X86FoldTableEntry Table0[] = {
@@ -408,8 +467,63 @@ static const X86FoldTableEntry Table0[] = {
};
static const X86FoldTableEntry Table1[] = {
+ {X86::ADC16ri8_ND, X86::ADC16mi8_ND, 0},
+ {X86::ADC16ri_ND, X86::ADC16mi_ND, 0},
+ {X86::ADC16rr_ND, X86::ADC16mr_ND, 0},
+ {X86::ADC32ri8_ND, X86::ADC32mi8_ND, 0},
+ {X86::ADC32ri_ND, X86::ADC32mi_ND, 0},
+ {X86::ADC32rr_ND, X86::ADC32mr_ND, 0},
+ {X86::ADC64ri32_ND, X86::ADC64mi32_ND, 0},
+ {X86::ADC64ri8_ND, X86::ADC64mi8_ND, 0},
+ {X86::ADC64rr_ND, X86::ADC64mr_ND, 0},
+ {X86::ADC8ri_ND, X86::ADC8mi_ND, 0},
+ {X86::ADC8rr_ND, X86::ADC8mr_ND, 0},
+ {X86::ADD16ri8_ND, X86::ADD16mi8_ND, 0},
+ {X86::ADD16ri8_NF_ND, X86::ADD16mi8_NF_ND, 0},
+ {X86::ADD16ri_ND, X86::ADD16mi_ND, 0},
+ {X86::ADD16ri_NF_ND, X86::ADD16mi_NF_ND, 0},
+ {X86::ADD16rr_ND, X86::ADD16mr_ND, 0},
+ {X86::ADD16rr_NF_ND, X86::ADD16mr_NF_ND, 0},
+ {X86::ADD32ri8_ND, X86::ADD32mi8_ND, 0},
+ {X86::ADD32ri8_NF_ND, X86::ADD32mi8_NF_ND, 0},
+ {X86::ADD32ri_ND, X86::ADD32mi_ND, 0},
+ {X86::ADD32ri_NF_ND, X86::ADD32mi_NF_ND, 0},
+ {X86::ADD32rr_ND, X86::ADD32mr_ND, 0},
+ {X86::ADD32rr_NF_ND, X86::ADD32mr_NF_ND, 0},
+ {X86::ADD64ri32_ND, X86::ADD64mi32_ND, 0},
+ {X86::ADD64ri32_NF_ND, X86::ADD64mi32_NF_ND, 0},
+ {X86::ADD64ri8_ND, X86::ADD64mi8_ND, 0},
+ {X86::ADD64ri8_NF_ND, X86::ADD64mi8_NF_ND, 0},
+ {X86::ADD64rr_ND, X86::ADD64mr_ND, 0},
+ {X86::ADD64rr_NF_ND, X86::ADD64mr_NF_ND, 0},
+ {X86::ADD8ri_ND, X86::ADD8mi_ND, 0},
+ {X86::ADD8ri_NF_ND, X86::ADD8mi_NF_ND, 0},
+ {X86::ADD8rr_ND, X86::ADD8mr_ND, 0},
+ {X86::ADD8rr_NF_ND, X86::ADD8mr_NF_ND, 0},
{X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16},
{X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16},
+ {X86::AND16ri8_ND, X86::AND16mi8_ND, 0},
+ {X86::AND16ri8_NF_ND, X86::AND16mi8_NF_ND, 0},
+ {X86::AND16ri_ND, X86::AND16mi_ND, 0},
+ {X86::AND16ri_NF_ND, X86::AND16mi_NF_ND, 0},
+ {X86::AND16rr_ND, X86::AND16mr_ND, 0},
+ {X86::AND16rr_NF_ND, X86::AND16mr_NF_ND, 0},
+ {X86::AND32ri8_ND, X86::AND32mi8_ND, 0},
+ {X86::AND32ri8_NF_ND, X86::AND32mi8_NF_ND, 0},
+ {X86::AND32ri_ND, X86::AND32mi_ND, 0},
+ {X86::AND32ri_NF_ND, X86::AND32mi_NF_ND, 0},
+ {X86::AND32rr_ND, X86::AND32mr_ND, 0},
+ {X86::AND32rr_NF_ND, X86::AND32mr_NF_ND, 0},
+ {X86::AND64ri32_ND, X86::AND64mi32_ND, 0},
+ {X86::AND64ri32_NF_ND, X86::AND64mi32_NF_ND, 0},
+ {X86::AND64ri8_ND, X86::AND64mi8_ND, 0},
+ {X86::AND64ri8_NF_ND, X86::AND64mi8_NF_ND, 0},
+ {X86::AND64rr_ND, X86::AND64mr_ND, 0},
+ {X86::AND64rr_NF_ND, X86::AND64mr_NF_ND, 0},
+ {X86::AND8ri_ND, X86::AND8mi_ND, 0},
+ {X86::AND8ri_NF_ND, X86::AND8mi_NF_ND, 0},
+ {X86::AND8rr_ND, X86::AND8mr_ND, 0},
+ {X86::AND8rr_NF_ND, X86::AND8mr_NF_ND, 0},
{X86::BEXTR32rr, X86::BEXTR32rm, 0},
{X86::BEXTR32rr_EVEX, X86::BEXTR32rm_EVEX, 0},
{X86::BEXTR64rr, X86::BEXTR64rm, 0},
@@ -555,6 +669,40 @@ static const X86FoldTableEntry Table1[] = {
{X86::MOVZX32rr8_NOREX, X86::MOVZX32rm8_NOREX, 0},
{X86::MOVZX64rr16, X86::MOVZX64rm16, 0},
{X86::MOVZX64rr8, X86::MOVZX64rm8, 0},
+ {X86::NEG16r_ND, X86::NEG16m_ND, 0},
+ {X86::NEG16r_NF_ND, X86::NEG16m_NF_ND, 0},
+ {X86::NEG32r_ND, X86::NEG32m_ND, 0},
+ {X86::NEG32r_NF_ND, X86::NEG32m_NF_ND, 0},
+ {X86::NEG64r_ND, X86::NEG64m_ND, 0},
+ {X86::NEG64r_NF_ND, X86::NEG64m_NF_ND, 0},
+ {X86::NEG8r_ND, X86::NEG8m_ND, 0},
+ {X86::NEG8r_NF_ND, X86::NEG8m_NF_ND, 0},
+ {X86::NOT16r_ND, X86::NOT16m_ND, 0},
+ {X86::NOT32r_ND, X86::NOT32m_ND, 0},
+ {X86::NOT64r_ND, X86::NOT64m_ND, 0},
+ {X86::NOT8r_ND, X86::NOT8m_ND, 0},
+ {X86::OR16ri8_ND, X86::OR16mi8_ND, 0},
+ {X86::OR16ri8_NF_ND, X86::OR16mi8_NF_ND, 0},
+ {X86::OR16ri_ND, X86::OR16mi_ND, 0},
+ {X86::OR16ri_NF_ND, X86::OR16mi_NF_ND, 0},
+ {X86::OR16rr_ND, X86::OR16mr_ND, 0},
+ {X86::OR16rr_NF_ND, X86::OR16mr_NF_ND, 0},
+ {X86::OR32ri8_ND, X86::OR32mi8_ND, 0},
+ {X86::OR32ri8_NF_ND, X86::OR32mi8_NF_ND, 0},
+ {X86::OR32ri_ND, X86::OR32mi_ND, 0},
+ {X86::OR32ri_NF_ND, X86::OR32mi_NF_ND, 0},
+ {X86::OR32rr_ND, X86::OR32mr_ND, 0},
+ {X86::OR32rr_NF_ND, X86::OR32mr_NF_ND, 0},
+ {X86::OR64ri32_ND, X86::OR64mi32_ND, 0},
+ {X86::OR64ri32_NF_ND, X86::OR64mi32_NF_ND, 0},
+ {X86::OR64ri8_ND, X86::OR64mi8_ND, 0},
+ {X86::OR64ri8_NF_ND, X86::OR64mi8_NF_ND, 0},
+ {X86::OR64rr_ND, X86::OR64mr_ND, 0},
+ {X86::OR64rr_NF_ND, X86::OR64mr_NF_ND, 0},
+ {X86::OR8ri_ND, X86::OR8mi_ND, 0},
+ {X86::OR8ri_NF_ND, X86::OR8mi_NF_ND, 0},
+ {X86::OR8rr_ND, X86::OR8mr_ND, 0},
+ {X86::OR8rr_NF_ND, X86::OR8mr_NF_ND, 0},
{X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16},
{X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16},
{X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16},
@@ -605,6 +753,17 @@ static const X86FoldTableEntry Table1[] = {
{X86::SARX32rr_EVEX, X86::SARX32rm_EVEX, 0},
{X86::SARX64rr, X86::SARX64rm, 0},
{X86::SARX64rr_EVEX, X86::SARX64rm_EVEX, 0},
+ {X86::SBB16ri8_ND, X86::SBB16mi8_ND, 0},
+ {X86::SBB16ri_ND, X86::SBB16mi_ND, 0},
+ {X86::SBB16rr_ND, X86::SBB16mr_ND, 0},
+ {X86::SBB32ri8_ND, X86::SBB32mi8_ND, 0},
+ {X86::SBB32ri_ND, X86::SBB32mi_ND, 0},
+ {X86::SBB32rr_ND, X86::SBB32mr_ND, 0},
+ {X86::SBB64ri32_ND, X86::SBB64mi32_ND, 0},
+ {X86::SBB64ri8_ND, X86::SBB64mi8_ND, 0},
+ {X86::SBB64rr_ND, X86::SBB64mr_ND, 0},
+ {X86::SBB8ri_ND, X86::SBB8mi_ND, 0},
+ {X86::SBB8rr_ND, X86::SBB8mr_ND, 0},
{X86::SHLX32rr, X86::SHLX32rm, 0},
{X86::SHLX32rr_EVEX, X86::SHLX32rm_EVEX, 0},
{X86::SHLX64rr, X86::SHLX64rm, 0},
@@ -617,6 +776,28 @@ static const X86FoldTableEntry Table1[] = {
{X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16},
{X86::SQRTSDr, X86::SQRTSDm, 0},
{X86::SQRTSSr, X86::SQRTSSm, 0},
+ {X86::SUB16ri8_ND, X86::SUB16mi8_ND, 0},
+ {X86::SUB16ri8_NF_ND, X86::SUB16mi8_NF_ND, 0},
+ {X86::SUB16ri_ND, X86::SUB16mi_ND, 0},
+ {X86::SUB16ri_NF_ND, X86::SUB16mi_NF_ND, 0},
+ {X86::SUB16rr_ND, X86::SUB16mr_ND, 0},
+ {X86::SUB16rr_NF_ND, X86::SUB16mr_NF_ND, 0},
+ {X86::SUB32ri8_ND, X86::SUB32mi8_ND, 0},
+ {X86::SUB32ri8_NF_ND, X86::SUB32mi8_NF_ND, 0},
+ {X86::SUB32ri_ND, X86::SUB32mi_ND, 0},
+ {X86::SUB32ri_NF_ND, X86::SUB32mi_NF_ND, 0},
+ {X86::SUB32rr_ND, X86::SUB32mr_ND, 0},
+ {X86::SUB32rr_NF_ND, X86::SUB32mr_NF_ND, 0},
+ {X86::SUB64ri32_ND, X86::SUB64mi32_ND, 0},
+ {X86::SUB64ri32_NF_ND, X86::SUB64mi32_NF_ND, 0},
+ {X86::SUB64ri8_ND, X86::SUB64mi8_ND, 0},
+ {X86::SUB64ri8_NF_ND, X86::SUB64mi8_NF_ND, 0},
+ {X86::SUB64rr_ND, X86::SUB64mr_ND, 0},
+ {X86::SUB64rr_NF_ND, X86::SUB64mr_NF_ND, 0},
+ {X86::SUB8ri_ND, X86::SUB8mi_ND, 0},
+ {X86::SUB8ri_NF_ND, X86::SUB8mi_NF_ND, 0},
+ {X86::SUB8rr_ND, X86::SUB8mr_ND, 0},
+ {X86::SUB8rr_NF_ND, X86::SUB8mr_NF_ND, 0},
{X86::T1MSKC32rr, X86::T1MSKC32rm, 0},
{X86::T1MSKC64rr, X86::T1MSKC64rm, 0},
{X86::TZCNT16rr, X86::TZCNT16rm, 0},
@@ -1314,6 +1495,28 @@ static const X86FoldTableEntry Table1[] = {
{X86::VUCOMISSZrr_Int, X86::VUCOMISSZrm_Int, TB_NO_REVERSE},
{X86::VUCOMISSrr, X86::VUCOMISSrm, 0},
{X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE},
+ {X86::XOR16ri8_ND, X86::XOR16mi8_ND, 0},
+ {X86::XOR16ri8_NF_ND, X86::XOR16mi8_NF_ND, 0},
+ {X86::XOR16ri_ND, X86::XOR16mi_ND, 0},
+ {X86::XOR16ri_NF_ND, X86::XOR16mi_NF_ND, 0},
+ {X86::XOR16rr_ND, X86::XOR16mr_ND, 0},
+ {X86::XOR16rr_NF_ND, X86::XOR16mr_NF_ND, 0},
+ {X86::XOR32ri8_ND, X86::XOR32mi8_ND, 0},
+ {X86::XOR32ri8_NF_ND, X86::XOR32mi8_NF_ND, 0},
+ {X86::XOR32ri_ND, X86::XOR32mi_ND, 0},
+ {X86::XOR32ri_NF_ND, X86::XOR32mi_NF_ND, 0},
+ {X86::XOR32rr_ND, X86::XOR32mr_ND, 0},
+ {X86::XOR32rr_NF_ND, X86::XOR32mr_NF_ND, 0},
+ {X86::XOR64ri32_ND, X86::XOR64mi32_ND, 0},
+ {X86::XOR64ri32_NF_ND, X86::XOR64mi32_NF_ND, 0},
+ {X86::XOR64ri8_ND, X86::XOR64mi8_ND, 0},
+ {X86::XOR64ri8_NF_ND, X86::XOR64mi8_NF_ND, 0},
+ {X86::XOR64rr_ND, X86::XOR64mr_ND, 0},
+ {X86::XOR64rr_NF_ND, X86::XOR64mr_NF_ND, 0},
+ {X86::XOR8ri_ND, X86::XOR8mi_ND, 0},
+ {X86::XOR8ri_NF_ND, X86::XOR8mi_NF_ND, 0},
+ {X86::XOR8rr_ND, X86::XOR8mr_ND, 0},
+ {X86::XOR8rr_NF_ND, X86::XOR8mr_NF_ND, 0},
};
static const X86FoldTableEntry Table2[] = {
@@ -1322,15 +1525,31 @@ static const X86FoldTableEntry Table2[] = {
{X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE},
{X86::ADD8rr_DB, X86::ADD8rm, TB_NO_REVERSE},
{X86::ADC16rr, X86::ADC16rm, 0},
+ {X86::ADC16rr_ND, X86::ADC16rm_ND, 0},
{X86::ADC32rr, X86::ADC32rm, 0},
+ {X86::ADC32rr_ND, X86::ADC32rm_ND, 0},
{X86::ADC64rr, X86::ADC64rm, 0},
+ {X86::ADC64rr_ND, X86::ADC64rm_ND, 0},
{X86::ADC8rr, X86::ADC8rm, 0},
+ {X86::ADC8rr_ND, X86::ADC8rm_ND, 0},
{X86::ADCX32rr, X86::ADCX32rm, 0},
{X86::ADCX64rr, X86::ADCX64rm, 0},
{X86::ADD16rr, X86::ADD16rm, 0},
+ {X86::ADD16rr_ND, X86::ADD16rm_ND, 0},
+ {X86::ADD16rr_NF, X86::ADD16rm_NF, 0},
+ {X86::ADD16rr_NF_ND, X86::ADD16rm_NF_ND, 0},
{X86::ADD32rr, X86::ADD32rm, 0},
+ {X86::ADD32rr_ND, X86::ADD32rm_ND, 0},
+ {X86::ADD32rr_NF, X86::ADD32rm_NF, 0},
+ {X86::ADD32rr_NF_ND, X86::ADD32rm_NF_ND, 0},
{X86::ADD64rr, X86::ADD64rm, 0},
+ {X86::ADD64rr_ND, X86::ADD64rm_ND, 0},
+ {X86::ADD64rr_NF, X86::ADD64rm_NF, 0},
+ {X86::ADD64rr_NF_ND, X86::ADD64rm_NF_ND, 0},
{X86::ADD8rr, X86::ADD8rm, 0},
+ {X86::ADD8rr_ND, X86::ADD8rm_ND, 0},
+ {X86::ADD8rr_NF, X86::ADD8rm_NF, 0},
+ {X86::ADD8rr_NF_ND, X86::ADD8rm_NF_ND, 0},
{X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16},
{X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16},
{X86::ADDSDrr, X86::ADDSDrm, 0},
@@ -1346,9 +1565,21 @@ static const X86FoldTableEntry Table2[] = {
{X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16},
{X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16},
{X86::AND16rr, X86::AND16rm, 0},
+ {X86::AND16rr_ND, X86::AND16rm_ND, 0},
+ {X86::AND16rr_NF, X86::AND16rm_NF, 0},
+ {X86::AND16rr_NF_ND, X86::AND16rm_NF_ND, 0},
{X86::AND32rr, X86::AND32rm, 0},
+ {X86::AND32rr_ND, X86::AND32rm_ND, 0},
+ {X86::AND32rr_NF, X86::AND32rm_NF, 0},
+ {X86::AND32rr_NF_ND, X86::AND32rm_NF_ND, 0},
{X86::AND64rr, X86::AND64rm, 0},
+ {X86::AND64rr_ND, X86::AND64rm_ND, 0},
+ {X86::AND64rr_NF, X86::AND64rm_NF, 0},
+ {X86::AND64rr_NF_ND, X86::AND64rm_NF_ND, 0},
{X86::AND8rr, X86::AND8rm, 0},
+ {X86::AND8rr_ND, X86::AND8rm_ND, 0},
+ {X86::AND8rr_NF, X86::AND8rm_NF, 0},
+ {X86::AND8rr_NF_ND, X86::AND8rm_NF_ND, 0},
{X86::ANDN32rr, X86::ANDN32rm, 0},
{X86::ANDN32rr_EVEX, X86::ANDN32rm_EVEX, 0},
{X86::ANDN64rr, X86::ANDN64rm, 0},
@@ -1371,10 +1602,15 @@ static const X86FoldTableEntry Table2[] = {
{X86::CMPSSrr, X86::CMPSSrm, 0},
{X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE},
{X86::CRC32r32r16, X86::CRC32r32m16, 0},
+ {X86::CRC32r32r16_EVEX, X86::CRC32r32m16_EVEX, 0},
{X86::CRC32r32r32, X86::CRC32r32m32, 0},
+ {X86::CRC32r32r32_EVEX, X86::CRC32r32m32_EVEX, 0},
{X86::CRC32r32r8, X86::CRC32r32m8, 0},
+ {X86::CRC32r32r8_EVEX, X86::CRC32r32m8_EVEX, 0},
{X86::CRC32r64r64, X86::CRC32r64m64, 0},
+ {X86::CRC32r64r64_EVEX, X86::CRC32r64m64_EVEX, 0},
{X86::CRC32r64r8, X86::CRC32r64m8, 0},
+ {X86::CRC32r64r8_EVEX, X86::CRC32r64m8_EVEX, 0},
{X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE},
{X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0},
{X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0},
@@ -1503,9 +1739,21 @@ static const X86FoldTableEntry Table2[] = {
{X86::MULX64rr, X86::MULX64rm, 0},
{X86::MULX64rr_EVEX, X86::MULX64rm_EVEX, 0},
{X86::OR16rr, X86::OR16rm, 0},
+ {X86::OR16rr_ND, X86::OR16rm_ND, 0},
+ {X86::OR16rr_NF, X86::OR16rm_NF, 0},
+ {X86::OR16rr_NF_ND, X86::OR16rm_NF_ND, 0},
{X86::OR32rr, X86::OR32rm, 0},
+ {X86::OR32rr_ND, X86::OR32rm_ND, 0},
+ {X86::OR32rr_NF, X86::OR32rm_NF, 0},
+ {X86::OR32rr_NF_ND, X86::OR32rm_NF_ND, 0},
{X86::OR64rr, X86::OR64rm, 0},
+ {X86::OR64rr_ND, X86::OR64rm_ND, 0},
+ {X86::OR64rr_NF, X86::OR64rm_NF, 0},
+ {X86::OR64rr_NF_ND, X86::OR64rm_NF_ND, 0},
{X86::OR8rr, X86::OR8rm, 0},
+ {X86::OR8rr_ND, X86::OR8rm_ND, 0},
+ {X86::OR8rr_NF, X86::OR8rm_NF, 0},
+ {X86::OR8rr_NF_ND, X86::OR8rm_NF_ND, 0},
{X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16},
{X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16},
{X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16},
@@ -1628,9 +1876,13 @@ static const X86FoldTableEntry Table2[] = {
{X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE},
{X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE},
{X86::SBB16rr, X86::SBB16rm, 0},
+ {X86::SBB16rr_ND, X86::SBB16rm_ND, 0},
{X86::SBB32rr, X86::SBB32rm, 0},
+ {X86::SBB32rr_ND, X86::SBB32rm_ND, 0},
{X86::SBB64rr, X86::SBB64rm, 0},
+ {X86::SBB64rr_ND, X86::SBB64rm_ND, 0},
{X86::SBB8rr, X86::SBB8rm, 0},
+ {X86::SBB8rr_ND, X86::SBB8rm_ND, 0},
{X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16},
{X86::SHA1MSG1rr_EVEX, X86::SHA1MSG1rm_EVEX, TB_ALIGN_16},
{X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16},
@@ -1650,9 +1902,21 @@ static const X86FoldTableEntry Table2[] = {
{X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE},
{X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE},
{X86::SUB16rr, X86::SUB16rm, 0},
+ {X86::SUB16rr_ND, X86::SUB16rm_ND, 0},
+ {X86::SUB16rr_NF, X86::SUB16rm_NF, 0},
+ {X86::SUB16rr_NF_ND, X86::SUB16rm_NF_ND, 0},
{X86::SUB32rr, X86::SUB32rm, 0},
+ {X86::SUB32rr_ND, X86::SUB32rm_ND, 0},
+ {X86::SUB32rr_NF, X86::SUB32rm_NF, 0},
+ {X86::SUB32rr_NF_ND, X86::SUB32rm_NF_ND, 0},
{X86::SUB64rr, X86::SUB64rm, 0},
+ {X86::SUB64rr_ND, X86::SUB64rm_ND, 0},
+ {X86::SUB64rr_NF, X86::SUB64rm_NF, 0},
+ {X86::SUB64rr_NF_ND, X86::SUB64rm_NF_ND, 0},
{X86::SUB8rr, X86::SUB8rm, 0},
+ {X86::SUB8rr_ND, X86::SUB8rm_ND, 0},
+ {X86::SUB8rr_NF, X86::SUB8rm_NF, 0},
+ {X86::SUB8rr_NF_ND, X86::SUB8rm_NF_ND, 0},
{X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16},
{X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16},
{X86::SUBSDrr, X86::SUBSDrm, 0},
@@ -3281,9 +3545,21 @@ static const X86FoldTableEntry Table2[] = {
{X86::VXORPSZrr, X86::VXORPSZrm, 0},
{X86::VXORPSrr, X86::VXORPSrm, 0},
{X86::XOR16rr, X86::XOR16rm, 0},
+ {X86::XOR16rr_ND, X86::XOR16rm_ND, 0},
+ {X86::XOR16rr_NF, X86::XOR16rm_NF, 0},
+ {X86::XOR16rr_NF_ND, X86::XOR16rm_NF_ND, 0},
{X86::XOR32rr, X86::XOR32rm, 0},
+ {X86::XOR32rr_ND, X86::XOR32rm_ND, 0},
+ {X86::XOR32rr_NF, X86::XOR32rm_NF, 0},
+ {X86::XOR32rr_NF_ND, X86::XOR32rm_NF_ND, 0},
{X86::XOR64rr, X86::XOR64rm, 0},
+ {X86::XOR64rr_ND, X86::XOR64rm_ND, 0},
+ {X86::XOR64rr_NF, X86::XOR64rm_NF, 0},
+ {X86::XOR64rr_NF_ND, X86::XOR64rm_NF_ND, 0},
{X86::XOR8rr, X86::XOR8rm, 0},
+ {X86::XOR8rr_ND, X86::XOR8rm_ND, 0},
+ {X86::XOR8rr_NF, X86::XOR8rm_NF, 0},
+ {X86::XOR8rr_NF_ND, X86::XOR8rm_NF_ND, 0},
{X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16},
{X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16},
};
diff --git a/llvm/test/ThinLTO/X86/devirt.ll b/llvm/test/ThinLTO/X86/devirt.ll
index 3bb5d6b2e4e0..472e43d77680 100644
--- a/llvm/test/ThinLTO/X86/devirt.ll
+++ b/llvm/test/ThinLTO/X86/devirt.ll
@@ -100,7 +100,7 @@ target triple = "x86_64-grtev4-linux-gnu"
@_ZTV1D = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr undef, ptr @_ZN1D1mEi] }, !type !3
-; CHECK-IR-LABEL: define i32 @test
+; CHECK-IR-LABEL: define {{(noundef )?}}i32 @test
define i32 @test(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/llvm/test/ThinLTO/X86/devirt2.ll b/llvm/test/ThinLTO/X86/devirt2.ll
index 1b33741d37bd..9e91efeba0da 100644
--- a/llvm/test/ThinLTO/X86/devirt2.ll
+++ b/llvm/test/ThinLTO/X86/devirt2.ll
@@ -202,7 +202,7 @@ entry:
; CHECK-IR1-LABEL: ret i32
; CHECK-IR1-LABEL: }
-; CHECK-IR2: define i32 @test2
+; CHECK-IR2: define noundef i32 @test2
; CHECK-IR2-NEXT: entry:
; Check that the call was devirtualized. Ignore extra character before
; symbol name which would happen if it was promoted during module
diff --git a/llvm/test/ThinLTO/X86/devirt_check.ll b/llvm/test/ThinLTO/X86/devirt_check.ll
index bf03afa0d8c6..74f1dfd6ac01 100644
--- a/llvm/test/ThinLTO/X86/devirt_check.ll
+++ b/llvm/test/ThinLTO/X86/devirt_check.ll
@@ -40,7 +40,7 @@ target triple = "x86_64-grtev4-linux-gnu"
@_ZTV1B = constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%struct.B*, i32)* @_ZN1B1fEi to i8*), i8* bitcast (i32 (%struct.A*, i32)* @_ZN1A1nEi to i8*)] }, !type !0, !type !1, !vcall_visibility !5
-; CHECK-LABEL: define i32 @test
+; CHECK-LABEL: define {{(noundef )?}}i32 @test
define i32 @test(%struct.A* %obj, i32 %a) {
entry:
%0 = bitcast %struct.A* %obj to i8***
diff --git a/llvm/test/ThinLTO/X86/devirt_promote.ll b/llvm/test/ThinLTO/X86/devirt_promote.ll
index fc2b41f215af..d00701be3a17 100644
--- a/llvm/test/ThinLTO/X86/devirt_promote.ll
+++ b/llvm/test/ThinLTO/X86/devirt_promote.ll
@@ -63,7 +63,7 @@ entry:
; CHECK-IR1-LABEL: ret i32
; CHECK-IR1-LABEL: }
-; CHECK-IR2: define i32 @test2
+; CHECK-IR2: define noundef i32 @test2
; Check that the call was devirtualized.
; CHECK-IR2: %call4 = tail call i32 @_ZN1A1nEi
diff --git a/llvm/test/ThinLTO/X86/devirt_promote_legacy.ll b/llvm/test/ThinLTO/X86/devirt_promote_legacy.ll
index 9d7a40204ba8..542c1e85b6dd 100644
--- a/llvm/test/ThinLTO/X86/devirt_promote_legacy.ll
+++ b/llvm/test/ThinLTO/X86/devirt_promote_legacy.ll
@@ -45,7 +45,7 @@ entry:
; CHECK-IR1-LABEL: ret i32
; CHECK-IR1-LABEL: }
-; CHECK-IR2: define i32 @test2
+; CHECK-IR2: define noundef i32 @test2
; Check that the call was devirtualized.
; CHECK-IR2: = tail call i32 @_ZN1A1nEi
diff --git a/llvm/test/ThinLTO/X86/devirt_pure_virtual_base.ll b/llvm/test/ThinLTO/X86/devirt_pure_virtual_base.ll
index cff616091e03..ea69eedb6e35 100644
--- a/llvm/test/ThinLTO/X86/devirt_pure_virtual_base.ll
+++ b/llvm/test/ThinLTO/X86/devirt_pure_virtual_base.ll
@@ -66,7 +66,7 @@ target triple = "x86_64-grtev4-linux-gnu"
;; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [2 x ptr] [ ptr @_ZTV1A, ptr @_ZTV1B]
-; CHECK-IR-LABEL: define dso_local i32 @_start
+; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll b/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
index f5a37551085a..90fdf0d7dfa0 100644
--- a/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
@@ -24,11 +24,11 @@
; REMARK-COUNT-3: single-impl: devirtualized a call to _ZNK1A1fEv
-; IMPORT: define available_externally hidden i32 @_ZNK1A1fEv(ptr %this)
+; IMPORT: define available_externally hidden {{(noundef )?}}i32 @_ZNK1A1fEv(ptr %this)
; IMPORT-NEXT: entry:
; IMPORT-NEXT: ret i32 3
-; CODEGEN: define hidden i32 @main()
+; CODEGEN: define hidden {{(noundef )?}}i32 @main()
; CODEGEN-NEXT: entry:
; CODEGEN-NEXT: ret i32 23
diff --git a/llvm/test/ThinLTO/X86/devirt_vcall_vis_hidden.ll b/llvm/test/ThinLTO/X86/devirt_vcall_vis_hidden.ll
index 7ebb22c6f98c..7c007e15a804 100644
--- a/llvm/test/ThinLTO/X86/devirt_vcall_vis_hidden.ll
+++ b/llvm/test/ThinLTO/X86/devirt_vcall_vis_hidden.ll
@@ -71,7 +71,7 @@ target triple = "x86_64-grtev4-linux-gnu"
@_ZTV1D = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr undef, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5
-; CHECK-IR-LABEL: define i32 @test
+; CHECK-IR-LABEL: define {{(noundef )?}}i32 @test
define i32 @test(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/llvm/test/ThinLTO/X86/devirt_vcall_vis_public.ll b/llvm/test/ThinLTO/X86/devirt_vcall_vis_public.ll
index 10dda3fab0ba..dfb2f8f033d6 100644
--- a/llvm/test/ThinLTO/X86/devirt_vcall_vis_public.ll
+++ b/llvm/test/ThinLTO/X86/devirt_vcall_vis_public.ll
@@ -157,7 +157,7 @@ target triple = "x86_64-grtev4-linux-gnu"
@_ZTV1D = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr undef, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5
-; CHECK-IR-LABEL: define i32 @test
+; CHECK-IR-LABEL: define {{(noundef )?}}i32 @test
define i32 @test(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
@@ -193,7 +193,7 @@ entry:
; CHECK-IR-LABEL: ret i32
; CHECK-IR-LABEL: }
-; CHECK-IR-LABEL: define i32 @test_public
+; CHECK-IR-LABEL: define {{(noundef )?}}i32 @test_public
define i32 @test_public(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/llvm/test/ThinLTO/X86/funcimport.ll b/llvm/test/ThinLTO/X86/funcimport.ll
index 63a83b6a33b9..3f7941bb7648 100644
--- a/llvm/test/ThinLTO/X86/funcimport.ll
+++ b/llvm/test/ThinLTO/X86/funcimport.ll
@@ -33,7 +33,7 @@
; Verify that the optimizer run
; RUN: llvm-lto -thinlto-action=optimize %t2.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=OPTIMIZED
-; OPTIMIZED: define i32 @main()
+; OPTIMIZED: define noundef i32 @main()
; Verify that the codegen run
; RUN: llvm-lto -thinlto-action=codegen %t2.bc -o - | llvm-nm -o - | FileCheck %s --check-prefix=CODEGEN
diff --git a/llvm/test/ThinLTO/X86/globals-import-const-fold.ll b/llvm/test/ThinLTO/X86/globals-import-const-fold.ll
index ec94719abfa1..caa0103f9f82 100644
--- a/llvm/test/ThinLTO/X86/globals-import-const-fold.ll
+++ b/llvm/test/ThinLTO/X86/globals-import-const-fold.ll
@@ -9,7 +9,7 @@
; IMPORT: @baz = internal local_unnamed_addr constant i32 10
-; OPTIMIZE: define i32 @main()
+; OPTIMIZE: define noundef i32 @main()
; OPTIMIZE-NEXT: ret i32 10
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/ThinLTO/X86/import-constant.ll b/llvm/test/ThinLTO/X86/import-constant.ll
index 79d0ef536733..a614fd9f09d2 100644
--- a/llvm/test/ThinLTO/X86/import-constant.ll
+++ b/llvm/test/ThinLTO/X86/import-constant.ll
@@ -35,7 +35,7 @@
; @outer is a write-only variable that's stored to once, so the store and the global can be removed.
; OPT-NOT: @outer
-; OPT: define dso_local i32 @main()
+; OPT: define dso_local noundef i32 @main()
; OPT-NEXT: entry:
; OPT-NEXT: ret i32 12
diff --git a/llvm/test/ThinLTO/X86/index-const-prop-alias.ll b/llvm/test/ThinLTO/X86/index-const-prop-alias.ll
index 1e4855aa840c..91d311853a57 100644
--- a/llvm/test/ThinLTO/X86/index-const-prop-alias.ll
+++ b/llvm/test/ThinLTO/X86/index-const-prop-alias.ll
@@ -20,7 +20,7 @@
; IMPORT-NEXT: @g = internal global i32 42, align 4 #0
; IMPORT: attributes #0 = { "thinlto-internalize" }
-; CODEGEN: define dso_local i32 @main
+; CODEGEN: define dso_local noundef i32 @main
; CODEGEN-NEXT: ret i32 42
; PRESERVED: @g.alias = external global i32
diff --git a/llvm/test/ThinLTO/X86/index-const-prop.ll b/llvm/test/ThinLTO/X86/index-const-prop.ll
index 5087ec6ad5ca..f55fc23d1f32 100644
--- a/llvm/test/ThinLTO/X86/index-const-prop.ll
+++ b/llvm/test/ThinLTO/X86/index-const-prop.ll
@@ -23,7 +23,7 @@
; IMPORT-NEXT: @gFoo.llvm.0 = internal unnamed_addr global i32 1, align 4, !dbg !5
; IMPORT: !DICompileUnit({{.*}})
-; OPTIMIZE: define i32 @main
+; OPTIMIZE: define noundef i32 @main
; OPTIMIZE-NEXT: ret i32 3
; IMPORT2: @gBar = available_externally local_unnamed_addr global i32 2, align 4, !dbg !0
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll
index b777e764247f..a52bbfbe1a34 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll
@@ -11,10 +11,10 @@ define void @caller() {
; TUNIT-NEXT: entry:
; TUNIT-NEXT: [[LEFT:%.*]] = alloca [3 x i32], align 4
; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[LEFT]], align 4
-; TUNIT-NEXT: [[LEFT_0_1:%.*]] = getelementptr [3 x i32], ptr [[LEFT]], i64 0, i64 1
-; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[LEFT_0_1]], align 4
-; TUNIT-NEXT: [[LEFT_0_2:%.*]] = getelementptr [3 x i32], ptr [[LEFT]], i64 0, i64 2
-; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[LEFT_0_2]], align 4
+; TUNIT-NEXT: [[LEFT_B4:%.*]] = getelementptr i8, ptr [[LEFT]], i64 4
+; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[LEFT_B4]], align 4
+; TUNIT-NEXT: [[LEFT_B8:%.*]] = getelementptr i8, ptr [[LEFT]], i64 8
+; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[LEFT_B8]], align 4
; TUNIT-NEXT: call void @callee(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
; TUNIT-NEXT: ret void
;
@@ -36,10 +36,10 @@ define internal void @callee(ptr noalias %arg) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARG_PRIV:%.*]] = alloca [3 x i32], align 4
; CHECK-NEXT: store i32 [[TMP0]], ptr [[ARG_PRIV]], align 4
-; CHECK-NEXT: [[ARG_PRIV_0_1:%.*]] = getelementptr [3 x i32], ptr [[ARG_PRIV]], i64 0, i64 1
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARG_PRIV_0_1]], align 4
-; CHECK-NEXT: [[ARG_PRIV_0_2:%.*]] = getelementptr [3 x i32], ptr [[ARG_PRIV]], i64 0, i64 2
-; CHECK-NEXT: store i32 [[TMP2]], ptr [[ARG_PRIV_0_2]], align 4
+; CHECK-NEXT: [[ARG_PRIV_B4:%.*]] = getelementptr i8, ptr [[ARG_PRIV]], i64 4
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARG_PRIV_B4]], align 4
+; CHECK-NEXT: [[ARG_PRIV_B8:%.*]] = getelementptr i8, ptr [[ARG_PRIV]], i64 8
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[ARG_PRIV_B8]], align 4
; CHECK-NEXT: call void @use(ptr noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(12) [[ARG_PRIV]])
; CHECK-NEXT: ret void
;
@@ -48,5 +48,7 @@ entry:
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+; TUNIT: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+;.
+; CGSCC: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
;.
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
index fd9e95581970..877071c1a3fe 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
@@ -15,14 +15,14 @@ define internal i32 @f(ptr byval(%struct.ss) %b, ptr byval(i32) %X, i32 %i) noun
; CHECK-NEXT: store i32 [[TMP2]], ptr [[X_PRIV]], align 4
; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4
-; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1
-; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4
-; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
-; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1
-; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 8
+; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
+; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 8
; CHECK-NEXT: store i32 0, ptr [[X_PRIV]], align 4
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[X_PRIV]], align 4
-; CHECK-NEXT: [[A:%.*]] = add i32 [[L]], [[TRUETMP2]]
+; CHECK-NEXT: [[A:%.*]] = add i32 [[L]], [[TMP2]]
; CHECK-NEXT: ret i32 [[A]]
;
entry:
@@ -46,10 +46,10 @@ define i32 @test(ptr %X) {
; TUNIT-NEXT: entry:
; TUNIT-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
; TUNIT-NEXT: store i32 1, ptr [[S]], align 8
-; TUNIT-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; TUNIT-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8
-; TUNIT-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1
-; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_0_1]], align 8
+; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4
+; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8
; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4
; TUNIT-NEXT: [[C:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]], i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
; TUNIT-NEXT: ret i32 [[C]]
@@ -59,7 +59,7 @@ define i32 @test(ptr %X) {
; CGSCC-SAME: (ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CGSCC-NEXT: entry:
; CGSCC-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
-; CGSCC-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CGSCC-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4
; CGSCC-NEXT: [[C:%.*]] = call i32 @f(i32 noundef 1, i64 noundef 2, i32 [[TMP0]]) #[[ATTR2:[0-9]+]]
; CGSCC-NEXT: ret i32 [[C]]
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
index 2bb080c42760..b76254f66090 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
@@ -13,11 +13,11 @@ define internal void @f(ptr byval(%struct.ss) %b, ptr byval(i32) %X) nounwind
; CHECK-NEXT: store i32 [[TMP2]], ptr [[X_PRIV]], align 4
; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4
-; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1
-; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4
-; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
-; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1
-; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 8
+; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
+; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 8
; CHECK-NEXT: store i32 0, ptr [[X_PRIV]], align 4
; CHECK-NEXT: ret void
;
@@ -38,10 +38,10 @@ define i32 @test(ptr %X) {
; TUNIT-NEXT: entry:
; TUNIT-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
; TUNIT-NEXT: store i32 1, ptr [[S]], align 8
-; TUNIT-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; TUNIT-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8
-; TUNIT-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1
-; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_0_1]], align 8
+; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4
+; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8
; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4
; TUNIT-NEXT: call void @f(i32 [[TMP0]], i64 [[TMP1]], i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
; TUNIT-NEXT: ret i32 0
@@ -51,7 +51,7 @@ define i32 @test(ptr %X) {
; CGSCC-SAME: (ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CGSCC-NEXT: entry:
; CGSCC-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
-; CGSCC-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CGSCC-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
; CGSCC-NEXT: ret i32 0
;
entry:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
index 21f3e039f11d..77667875256f 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
@@ -13,12 +13,12 @@ define internal i32 @f(ptr byval(%struct.ss) %b) nounwind {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4
-; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1
-; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4
-; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
-; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1
-; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 8
-; CHECK-NEXT: ret i32 [[TRUETMP1]]
+; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
+; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 8
+; CHECK-NEXT: ret i32 [[TMP1]]
;
entry:
%tmp1 = load i32, ptr %b, align 4
@@ -35,12 +35,12 @@ define internal i32 @g(ptr byval(%struct.ss) align 32 %b) nounwind {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4
-; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1
-; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4
-; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 32
-; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1
-; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 32
-; CHECK-NEXT: ret i32 [[TRUETMP2]]
+; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4
+; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 32
+; CHECK-NEXT: ret i32 [[TMP2]]
;
entry:
%tmp1 = load i32, ptr %b, align 4
@@ -57,14 +57,14 @@ define i32 @main() nounwind {
; TUNIT-NEXT: entry:
; TUNIT-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
; TUNIT-NEXT: store i32 1, ptr [[S]], align 32
-; TUNIT-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; TUNIT-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8
-; TUNIT-NEXT: [[S_0_11:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1
-; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_0_11]], align 8
+; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4
+; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8
; TUNIT-NEXT: [[C0:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]]) #[[ATTR1:[0-9]+]]
; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[S]], align 32
-; TUNIT-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1
-; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_0_1]], align 32
+; TUNIT-NEXT: [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4
+; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_B41]], align 32
; TUNIT-NEXT: [[C1:%.*]] = call i32 @g(i32 [[TMP2]], i64 [[TMP3]]) #[[ATTR1]]
; TUNIT-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]]
; TUNIT-NEXT: ret i32 [[A]]
@@ -74,7 +74,7 @@ define i32 @main() nounwind {
; CGSCC-SAME: () #[[ATTR1:[0-9]+]] {
; CGSCC-NEXT: entry:
; CGSCC-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
-; CGSCC-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
+; CGSCC-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1
; CGSCC-NEXT: [[C0:%.*]] = call i32 @f(i32 noundef 1, i64 noundef 2) #[[ATTR2:[0-9]+]]
; CGSCC-NEXT: [[C1:%.*]] = call i32 @g(i32 noundef 1, i64 noundef 2) #[[ATTR2]]
; CGSCC-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]]
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll
index 0370b86faa09..274181fa8b9e 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll
@@ -14,8 +14,8 @@ target triple = "x86_64-unknown-linux-gnu"
@a = internal global %struct.Foo { i32 1, i64 2 }, align 8
;.
-; CHECK: @[[B:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] { double 3.140000e+00, i16 9439, i8 25, [5 x i8] undef }, align 16
-; CHECK: @[[A:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_FOO:%.*]] { i32 1, i64 2 }, align 8
+; CHECK: @b = internal global %struct.s { double 3.140000e+00, i16 9439, i8 25, [5 x i8] undef }, align 16
+; CHECK: @a = internal global %struct.Foo { i32 1, i64 2 }, align 8
;.
define void @run() {
;
@@ -82,8 +82,8 @@ define internal i64 @CaptureAStruct(ptr byval(%struct.Foo) %a) {
; CGSCC-NEXT: entry:
; CGSCC-NEXT: [[A_PRIV:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8
; CGSCC-NEXT: store i32 [[TMP0]], ptr [[A_PRIV]], align 4
-; CGSCC-NEXT: [[A_PRIV_0_1:%.*]] = getelementptr [[STRUCT_FOO]], ptr [[A_PRIV]], i64 0, i32 1
-; CGSCC-NEXT: store i64 [[TMP1]], ptr [[A_PRIV_0_1]], align 8
+; CGSCC-NEXT: [[A_PRIV_B8:%.*]] = getelementptr i8, ptr [[A_PRIV]], i64 8
+; CGSCC-NEXT: store i64 [[TMP1]], ptr [[A_PRIV_B8]], align 8
; CGSCC-NEXT: [[A_PTR:%.*]] = alloca ptr, align 8
; CGSCC-NEXT: br label [[LOOP:%.*]]
; CGSCC: loop:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
index 71a6e39e7166..c1e92bde0236 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
@@ -15,8 +15,8 @@ define internal void @bar(ptr byval(%pair) %Data) {
; TUNIT-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
; TUNIT-NEXT: [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]], align 8
; TUNIT-NEXT: store i32 [[TMP0]], ptr [[DATA_PRIV]], align 4
-; TUNIT-NEXT: [[DATA_PRIV_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA_PRIV]], i64 0, i32 1
-; TUNIT-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_0_1]], align 4
+; TUNIT-NEXT: [[DATA_PRIV_B4:%.*]] = getelementptr i8, ptr [[DATA_PRIV]], i64 4
+; TUNIT-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_B4]], align 4
; TUNIT-NEXT: [[TMP3:%.*]] = call ptr @foo(ptr nonnull dereferenceable(8) [[DATA_PRIV]])
; TUNIT-NEXT: ret void
;
@@ -25,8 +25,8 @@ define internal void @bar(ptr byval(%pair) %Data) {
; CGSCC-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
; CGSCC-NEXT: [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]], align 8
; CGSCC-NEXT: store i32 [[TMP0]], ptr [[DATA_PRIV]], align 4
-; CGSCC-NEXT: [[DATA_PRIV_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA_PRIV]], i64 0, i32 1
-; CGSCC-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_0_1]], align 4
+; CGSCC-NEXT: [[DATA_PRIV_B4:%.*]] = getelementptr i8, ptr [[DATA_PRIV]], i64 4
+; CGSCC-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_B4]], align 4
; CGSCC-NEXT: [[TMP3:%.*]] = call ptr @foo(ptr noundef nonnull dereferenceable(8) [[DATA_PRIV]])
; CGSCC-NEXT: ret void
;
@@ -38,16 +38,16 @@ define void @zed(ptr byval(%pair) %Data) {
; TUNIT-LABEL: define {{[^@]+}}@zed
; TUNIT-SAME: (ptr noalias nocapture nonnull readonly byval([[PAIR:%.*]]) dereferenceable(8) [[DATA:%.*]]) {
; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[DATA]], align 1
-; TUNIT-NEXT: [[DATA_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA]], i64 0, i32 1
-; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_0_1]], align 1
+; TUNIT-NEXT: [[DATA_B4:%.*]] = getelementptr i8, ptr [[DATA]], i64 4
+; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_B4]], align 1
; TUNIT-NEXT: call void @bar(i32 [[TMP1]], i32 [[TMP2]])
; TUNIT-NEXT: ret void
;
; CGSCC-LABEL: define {{[^@]+}}@zed
; CGSCC-SAME: (ptr noalias nocapture nofree noundef nonnull readonly byval([[PAIR:%.*]]) dereferenceable(8) [[DATA:%.*]]) {
; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[DATA]], align 1
-; CGSCC-NEXT: [[DATA_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA]], i64 0, i32 1
-; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_0_1]], align 1
+; CGSCC-NEXT: [[DATA_B4:%.*]] = getelementptr i8, ptr [[DATA]], i64 4
+; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_B4]], align 1
; CGSCC-NEXT: call void @bar(i32 [[TMP1]], i32 [[TMP2]])
; CGSCC-NEXT: ret void
;
@@ -55,7 +55,9 @@ define void @zed(ptr byval(%pair) %Data) {
ret void
}
;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { memory(readwrite, argmem: none) }
+; TUNIT: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
+;.
+; CGSCC: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) }
;.
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
index 81e6f959cef4..154b093e9dbb 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
@@ -9,7 +9,7 @@
declare void @use(i8)
;.
-; CHECK: @[[MYSTR:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_MYSTR:%.*]] zeroinitializer
+; CHECK: @mystr = internal global %struct.MYstr zeroinitializer
;.
define internal void @vfu1(ptr byval(%struct.MYstr) align 4 %u) nounwind {
; CHECK: Function Attrs: nounwind memory(readwrite, argmem: none)
@@ -18,8 +18,8 @@ define internal void @vfu1(ptr byval(%struct.MYstr) align 4 %u) nounwind {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8
; CHECK-NEXT: store i8 [[TMP0]], ptr [[U_PRIV]], align 1
-; CHECK-NEXT: [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i64 0, i32 1
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_0_1]], align 4
+; CHECK-NEXT: [[U_PRIV_B4:%.*]] = getelementptr i8, ptr [[U_PRIV]], i64 4
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_B4]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1
; CHECK-NEXT: store i32 99, ptr [[TMP2]], align 4
; CHECK-NEXT: store i8 97, ptr [[U_PRIV]], align 8
@@ -49,8 +49,8 @@ define internal i32 @vfu2(ptr byval(%struct.MYstr) align 4 %u) nounwind readonly
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8
; CHECK-NEXT: store i8 [[TMP0]], ptr [[U_PRIV]], align 1
-; CHECK-NEXT: [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i64 0, i32 1
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_0_1]], align 4
+; CHECK-NEXT: [[U_PRIV_B4:%.*]] = getelementptr i8, ptr [[U_PRIV]], i64 4
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_B4]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[U_PRIV]], align 8
@@ -74,12 +74,12 @@ define i32 @unions() nounwind {
; TUNIT-SAME: () #[[ATTR2:[0-9]+]] {
; TUNIT-NEXT: entry:
; TUNIT-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT: [[MYSTR_0_1:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], ptr @mystr, i64 0, i32 1
-; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_0_1]], align 8
+; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8
; TUNIT-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]]
; TUNIT-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT: [[MYSTR_0_11:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr @mystr, i64 0, i32 1
-; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_0_11]], align 8
+; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8
; TUNIT-NEXT: [[RESULT:%.*]] = call i32 @vfu2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3:[0-9]+]]
; TUNIT-NEXT: ret i32 [[RESULT]]
;
@@ -88,12 +88,12 @@ define i32 @unions() nounwind {
; CGSCC-SAME: () #[[ATTR2:[0-9]+]] {
; CGSCC-NEXT: entry:
; CGSCC-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8
-; CGSCC-NEXT: [[MYSTR_0_1:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], ptr @mystr, i64 0, i32 1
-; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_0_1]], align 8
+; CGSCC-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8
; CGSCC-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]]
; CGSCC-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8
-; CGSCC-NEXT: [[MYSTR_0_11:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr @mystr, i64 0, i32 1
-; CGSCC-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_0_11]], align 8
+; CGSCC-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; CGSCC-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8
; CGSCC-NEXT: [[RESULT:%.*]] = call i32 @vfu2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR2]]
; CGSCC-NEXT: ret i32 [[RESULT]]
;
@@ -110,8 +110,8 @@ define internal i32 @vfu2_v2(ptr byval(%struct.MYstr) align 4 %u) nounwind reado
; CHECK-NEXT: entry:
; CHECK-NEXT: [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8
; CHECK-NEXT: store i8 [[TMP0]], ptr [[U_PRIV]], align 1
-; CHECK-NEXT: [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i64 0, i32 1
-; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_0_1]], align 4
+; CHECK-NEXT: [[U_PRIV_B4:%.*]] = getelementptr i8, ptr [[U_PRIV]], i64 4
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_B4]], align 4
; CHECK-NEXT: [[Z:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1
; CHECK-NEXT: store i32 99, ptr [[Z]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1
@@ -139,12 +139,12 @@ define i32 @unions_v2() nounwind {
; TUNIT-SAME: () #[[ATTR2]] {
; TUNIT-NEXT: entry:
; TUNIT-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT: [[MYSTR_0_11:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], ptr @mystr, i64 0, i32 1
-; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_0_11]], align 8
+; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B41]], align 8
; TUNIT-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]]
; TUNIT-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8
-; TUNIT-NEXT: [[MYSTR_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr @mystr, i64 0, i32 1
-; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_0_1]], align 8
+; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4
+; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B4]], align 8
; TUNIT-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3]]
; TUNIT-NEXT: ret i32 [[RESULT]]
;
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
index ef1458b8bc84..99a4c0aac7a2 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll
@@ -32,37 +32,37 @@
@rec_storage = internal global i32 undef
;.
-; CHECK: @[[GLOBALBYTES:[a-zA-Z0-9_$"\\.-]+]] = global [1024 x i8] zeroinitializer, align 16
-; CHECK: @[[GINT1:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4
-; CHECK: @[[GINT2:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4
-; CHECK: @[[GSTATIC_INT1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[GSTATIC_INT2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[GSTATIC_INT3:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[GSTATIC_UNDEF_INT1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[GSTATIC_UNDEF_INT2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[GI1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[GI2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[GS1:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] undef, align 4
-; CHECK: @[[GS2:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] zeroinitializer, align 4
-; CHECK: @[[VS1:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] undef, align 4
-; CHECK: @[[VS2:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] undef, align 4
-; CHECK: @[[GBYTES:[a-zA-Z0-9_$"\\.-]+]] = internal global [1024 x i8] zeroinitializer, align 16
-; CHECK: @[[FLAG0:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4
-; CHECK: @[[FLAG1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[FLAG2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[FLAG4:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[FLAG3:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[A1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0
-; CHECK: @[[A2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0
-; CHECK: @[[A3:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef
-; CHECK: @[[BYTES1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef
-; CHECK: @[[BYTES2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef
-; CHECK: @[[REC_STORAGE:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef
-; CHECK: @[[GLOBAL:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_STY:%.*]] zeroinitializer, align 8
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4
-; CHECK: @[[GC:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4
-; CHECK: @[[GRS:[a-zA-Z0-9_$"\\.-]+]] = internal thread_local global i32 undef
-; CHECK: @[[GRS2:[a-zA-Z0-9_$"\\.-]+]] = global i32 undef
+; CHECK: @globalBytes = global [1024 x i8] zeroinitializer, align 16
+; CHECK: @Gint1 = global i32 0, align 4
+; CHECK: @Gint2 = global i32 0, align 4
+; CHECK: @Gstatic_int1 = internal global i32 0, align 4
+; CHECK: @Gstatic_int2 = internal global i32 0, align 4
+; CHECK: @Gstatic_int3 = internal global i32 0, align 4
+; CHECK: @Gstatic_undef_int1 = internal global i32 undef, align 4
+; CHECK: @Gstatic_undef_int2 = internal global i32 undef, align 4
+; CHECK: @GI1 = internal global i32 undef, align 4
+; CHECK: @GI2 = internal global i32 undef, align 4
+; CHECK: @Gs1 = internal global %struct.S undef, align 4
+; CHECK: @Gs2 = internal global %struct.S zeroinitializer, align 4
+; CHECK: @Vs1 = internal global %struct.S undef, align 4
+; CHECK: @Vs2 = internal global %struct.S undef, align 4
+; CHECK: @GBytes = internal global [1024 x i8] zeroinitializer, align 16
+; CHECK: @Flag0 = global i32 0, align 4
+; CHECK: @Flag1 = internal global i32 undef, align 4
+; CHECK: @Flag2 = internal global i32 undef, align 4
+; CHECK: @Flag4 = internal global i32 undef, align 4
+; CHECK: @Flag3 = internal global i32 0, align 4
+; CHECK: @a1 = internal global i32 0
+; CHECK: @a2 = internal global i32 0
+; CHECK: @a3 = internal global i32 undef
+; CHECK: @bytes1 = internal global i32 undef
+; CHECK: @bytes2 = internal global i32 undef
+; CHECK: @rec_storage = internal global i32 undef
+; CHECK: @global = internal global %struct.STy zeroinitializer, align 8
+; CHECK: @G = internal global i32 0, align 4
+; CHECK: @GC = internal global i32 undef, align 4
+; CHECK: @GRS = internal thread_local global i32 undef
+; CHECK: @GRS2 = global i32 undef
;.
define void @write_arg(ptr %p, i32 %v) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
@@ -2674,10 +2674,10 @@ define dso_local void @test_nested_memory(ptr %dst, ptr %src) {
; TUNIT-NEXT: store ptr [[SRC]], ptr [[SRC2]], align 8
; TUNIT-NEXT: store ptr [[CALL_H2S]], ptr getelementptr inbounds ([[STRUCT_STY]], ptr @global, i64 0, i32 2), align 8
; TUNIT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[LOCAL]], align 8
-; TUNIT-NEXT: [[LOCAL_0_1:%.*]] = getelementptr [[STRUCT_STY]], ptr [[LOCAL]], i64 0, i32 1
-; TUNIT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[LOCAL_0_1]], align 8
-; TUNIT-NEXT: [[LOCAL_0_2:%.*]] = getelementptr [[STRUCT_STY]], ptr [[LOCAL]], i64 0, i32 2
-; TUNIT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[LOCAL_0_2]], align 8
+; TUNIT-NEXT: [[LOCAL_B8:%.*]] = getelementptr i8, ptr [[LOCAL]], i64 8
+; TUNIT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[LOCAL_B8]], align 8
+; TUNIT-NEXT: [[LOCAL_B16:%.*]] = getelementptr i8, ptr [[LOCAL]], i64 16
+; TUNIT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[LOCAL_B16]], align 8
; TUNIT-NEXT: call fastcc void @nested_memory_callee(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]]) #[[ATTR21:[0-9]+]]
; TUNIT-NEXT: ret void
;
@@ -2714,10 +2714,10 @@ define internal fastcc void @nested_memory_callee(ptr nocapture readonly %S) nof
; TUNIT-NEXT: entry:
; TUNIT-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8
; TUNIT-NEXT: store ptr [[TMP0]], ptr [[S_PRIV]], align 8
-; TUNIT-NEXT: [[S_PRIV_0_1:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 1
-; TUNIT-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_0_1]], align 8
-; TUNIT-NEXT: [[S_PRIV_0_2:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2
-; TUNIT-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_0_2]], align 8
+; TUNIT-NEXT: [[S_PRIV_B8:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 8
+; TUNIT-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_B8]], align 8
+; TUNIT-NEXT: [[S_PRIV_B16:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 16
+; TUNIT-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_B16]], align 8
; TUNIT-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2
; TUNIT-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8
; TUNIT-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP3]], i64 0, i32 2
@@ -2736,10 +2736,10 @@ define internal fastcc void @nested_memory_callee(ptr nocapture readonly %S) nof
; CGSCC-NEXT: entry:
; CGSCC-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8
; CGSCC-NEXT: store ptr [[TMP0]], ptr [[S_PRIV]], align 8
-; CGSCC-NEXT: [[S_PRIV_0_1:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 1
-; CGSCC-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_0_1]], align 8
-; CGSCC-NEXT: [[S_PRIV_0_2:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2
-; CGSCC-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_0_2]], align 8
+; CGSCC-NEXT: [[S_PRIV_B8:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 8
+; CGSCC-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_B8]], align 8
+; CGSCC-NEXT: [[S_PRIV_B16:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 16
+; CGSCC-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_B16]], align 8
; CGSCC-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2
; CGSCC-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8
; CGSCC-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP3]], i64 0, i32 2
@@ -3289,67 +3289,67 @@ declare void @llvm.assume(i1 noundef)
;.
; TUNIT: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1}
-; TUNIT: [[META2:![0-9]+]] = !{!"clang version 13.0.0"}
-; TUNIT: [[TBAA3]] = !{!4, !4, i64 0}
-; TUNIT: [[META4:![0-9]+]] = !{!"int", !5, i64 0}
-; TUNIT: [[META5:![0-9]+]] = !{!"omnipotent char", !6, i64 0}
-; TUNIT: [[META6:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; TUNIT: [[TBAA7]] = !{!8, !9, i64 12}
-; TUNIT: [[META8:![0-9]+]] = !{!"S", !4, i64 0, !4, i64 4, !4, i64 8, !9, i64 12, !9, i64 16, !9, i64 20}
-; TUNIT: [[META9:![0-9]+]] = !{!"float", !5, i64 0}
-; TUNIT: [[TBAA10]] = !{!8, !9, i64 16}
-; TUNIT: [[TBAA11]] = !{!8, !9, i64 20}
-; TUNIT: [[TBAA12]] = !{!8, !4, i64 0}
-; TUNIT: [[TBAA13]] = !{!8, !4, i64 4}
-; TUNIT: [[TBAA14]] = !{!8, !4, i64 8}
-; TUNIT: [[LOOP15]] = distinct !{!15, !16}
-; TUNIT: [[META16:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; TUNIT: [[LOOP17]] = distinct !{!17, !16}
-; TUNIT: [[LOOP18]] = distinct !{!18, !16}
-; TUNIT: [[TBAA19]] = !{!5, !5, i64 0}
-; TUNIT: [[LOOP20]] = distinct !{!20, !16}
-; TUNIT: [[LOOP21]] = distinct !{!21, !16}
-; TUNIT: [[LOOP22]] = distinct !{!22, !16}
-; TUNIT: [[LOOP23]] = distinct !{!23, !16}
-; TUNIT: [[LOOP24]] = distinct !{!24, !16}
-; TUNIT: [[LOOP25]] = distinct !{!25, !16}
-; TUNIT: [[TBAA26]] = !{!9, !9, i64 0}
-; TUNIT: [[LOOP27]] = distinct !{!27, !16}
-; TUNIT: [[TBAA28]] = !{!29, !29, i64 0}
-; TUNIT: [[META29:![0-9]+]] = !{!"long long", !5, i64 0}
-; TUNIT: [[LOOP30]] = distinct !{!30, !16}
-; TUNIT: [[LOOP31]] = distinct !{!31, !16}
+; TUNIT: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; TUNIT: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+; TUNIT: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+; TUNIT: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+; TUNIT: [[META6]] = !{!"Simple C/C++ TBAA"}
+; TUNIT: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12}
+; TUNIT: [[META8]] = !{!"S", [[META4]], i64 0, [[META4]], i64 4, [[META4]], i64 8, [[META9]], i64 12, [[META9]], i64 16, [[META9]], i64 20}
+; TUNIT: [[META9]] = !{!"float", [[META5]], i64 0}
+; TUNIT: [[TBAA10]] = !{[[META8]], [[META9]], i64 16}
+; TUNIT: [[TBAA11]] = !{[[META8]], [[META9]], i64 20}
+; TUNIT: [[TBAA12]] = !{[[META8]], [[META4]], i64 0}
+; TUNIT: [[TBAA13]] = !{[[META8]], [[META4]], i64 4}
+; TUNIT: [[TBAA14]] = !{[[META8]], [[META4]], i64 8}
+; TUNIT: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]]}
+; TUNIT: [[META16]] = !{!"llvm.loop.mustprogress"}
+; TUNIT: [[LOOP17]] = distinct !{[[LOOP17]], [[META16]]}
+; TUNIT: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]]}
+; TUNIT: [[TBAA19]] = !{[[META5]], [[META5]], i64 0}
+; TUNIT: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]]}
+; TUNIT: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]}
+; TUNIT: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]]}
+; TUNIT: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]}
+; TUNIT: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]]}
+; TUNIT: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]]}
+; TUNIT: [[TBAA26]] = !{[[META9]], [[META9]], i64 0}
+; TUNIT: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]]}
+; TUNIT: [[TBAA28]] = !{[[META29:![0-9]+]], [[META29]], i64 0}
+; TUNIT: [[META29]] = !{!"long long", [[META5]], i64 0}
+; TUNIT: [[LOOP30]] = distinct !{[[LOOP30]], [[META16]]}
+; TUNIT: [[LOOP31]] = distinct !{[[LOOP31]], [[META16]]}
;.
; CGSCC: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1}
-; CGSCC: [[META2:![0-9]+]] = !{!"clang version 13.0.0"}
-; CGSCC: [[TBAA3]] = !{!4, !4, i64 0}
-; CGSCC: [[META4:![0-9]+]] = !{!"int", !5, i64 0}
-; CGSCC: [[META5:![0-9]+]] = !{!"omnipotent char", !6, i64 0}
-; CGSCC: [[META6:![0-9]+]] = !{!"Simple C/C++ TBAA"}
-; CGSCC: [[TBAA7]] = !{!8, !9, i64 12}
-; CGSCC: [[META8:![0-9]+]] = !{!"S", !4, i64 0, !4, i64 4, !4, i64 8, !9, i64 12, !9, i64 16, !9, i64 20}
-; CGSCC: [[META9:![0-9]+]] = !{!"float", !5, i64 0}
-; CGSCC: [[TBAA10]] = !{!8, !9, i64 16}
-; CGSCC: [[TBAA11]] = !{!8, !9, i64 20}
-; CGSCC: [[TBAA12]] = !{!8, !4, i64 0}
-; CGSCC: [[TBAA13]] = !{!8, !4, i64 4}
-; CGSCC: [[TBAA14]] = !{!8, !4, i64 8}
-; CGSCC: [[TBAA15]] = !{!5, !5, i64 0}
-; CGSCC: [[LOOP16]] = distinct !{!16, !17}
-; CGSCC: [[META17:![0-9]+]] = !{!"llvm.loop.mustprogress"}
-; CGSCC: [[TBAA18]] = !{!9, !9, i64 0}
-; CGSCC: [[LOOP19]] = distinct !{!19, !17}
-; CGSCC: [[TBAA20]] = !{!21, !21, i64 0}
-; CGSCC: [[META21:![0-9]+]] = !{!"long long", !5, i64 0}
-; CGSCC: [[LOOP22]] = distinct !{!22, !17}
-; CGSCC: [[LOOP23]] = distinct !{!23, !17}
-; CGSCC: [[LOOP24]] = distinct !{!24, !17}
-; CGSCC: [[LOOP25]] = distinct !{!25, !17}
-; CGSCC: [[LOOP26]] = distinct !{!26, !17}
-; CGSCC: [[LOOP27]] = distinct !{!27, !17}
-; CGSCC: [[LOOP28]] = distinct !{!28, !17}
-; CGSCC: [[LOOP29]] = distinct !{!29, !17}
-; CGSCC: [[LOOP30]] = distinct !{!30, !17}
-; CGSCC: [[LOOP31]] = distinct !{!31, !17}
+; CGSCC: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; CGSCC: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+; CGSCC: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+; CGSCC: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+; CGSCC: [[META6]] = !{!"Simple C/C++ TBAA"}
+; CGSCC: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12}
+; CGSCC: [[META8]] = !{!"S", [[META4]], i64 0, [[META4]], i64 4, [[META4]], i64 8, [[META9]], i64 12, [[META9]], i64 16, [[META9]], i64 20}
+; CGSCC: [[META9]] = !{!"float", [[META5]], i64 0}
+; CGSCC: [[TBAA10]] = !{[[META8]], [[META9]], i64 16}
+; CGSCC: [[TBAA11]] = !{[[META8]], [[META9]], i64 20}
+; CGSCC: [[TBAA12]] = !{[[META8]], [[META4]], i64 0}
+; CGSCC: [[TBAA13]] = !{[[META8]], [[META4]], i64 4}
+; CGSCC: [[TBAA14]] = !{[[META8]], [[META4]], i64 8}
+; CGSCC: [[TBAA15]] = !{[[META5]], [[META5]], i64 0}
+; CGSCC: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]]}
+; CGSCC: [[META17]] = !{!"llvm.loop.mustprogress"}
+; CGSCC: [[TBAA18]] = !{[[META9]], [[META9]], i64 0}
+; CGSCC: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]]}
+; CGSCC: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
+; CGSCC: [[META21]] = !{!"long long", [[META5]], i64 0}
+; CGSCC: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]]}
+; CGSCC: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]]}
+; CGSCC: [[LOOP24]] = distinct !{[[LOOP24]], [[META17]]}
+; CGSCC: [[LOOP25]] = distinct !{[[LOOP25]], [[META17]]}
+; CGSCC: [[LOOP26]] = distinct !{[[LOOP26]], [[META17]]}
+; CGSCC: [[LOOP27]] = distinct !{[[LOOP27]], [[META17]]}
+; CGSCC: [[LOOP28]] = distinct !{[[LOOP28]], [[META17]]}
+; CGSCC: [[LOOP29]] = distinct !{[[LOOP29]], [[META17]]}
+; CGSCC: [[LOOP30]] = distinct !{[[LOOP30]], [[META17]]}
+; CGSCC: [[LOOP31]] = distinct !{[[LOOP31]], [[META17]]}
;.
diff --git a/llvm/test/Transforms/CanonicalizeFreezeInLoops/duplicate_remove.ll b/llvm/test/Transforms/CanonicalizeFreezeInLoops/duplicate_remove.ll
new file mode 100644
index 000000000000..a46bb00ba7fa
--- /dev/null
+++ b/llvm/test/Transforms/CanonicalizeFreezeInLoops/duplicate_remove.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s --passes=canon-freeze -S | FileCheck %s
+
+define void @check_duplicate_removal(i32 %n) {
+; CHECK-LABEL: define void @check_duplicate_removal(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[N_FROZEN:%.*]] = freeze i32 [[N]]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[T1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[T3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[T2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[T3]], [[LOOP]] ]
+; CHECK-NEXT: [[T3]] = add i32 [[N_FROZEN]], [[T2]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[T2]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %t1 = phi i32 [ 0, %entry], [%t3, %loop ]
+ %t2 = phi i32 [ 0, %entry], [%t3, %loop ]
+ %t3 = add i32 %n, %t2
+ %.fr = freeze i32 %t3
+ %cond = icmp eq i32 %t2, 0
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/ConstraintElimination/abs.ll b/llvm/test/Transforms/ConstraintElimination/abs.ll
new file mode 100644
index 000000000000..c3162ba48c2e
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/abs.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s
+
+define i1 @abs_int_min_is_not_poison(i32 %arg) {
+; CHECK-LABEL: define i1 @abs_int_min_is_not_poison(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 false)
+; CHECK-NEXT: ret i1 true
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 false)
+ %cmp = icmp sge i32 %abs, %arg
+ ret i1 %cmp
+}
+
+define i1 @abs_int_min_is_poison(i32 %arg) {
+; CHECK-LABEL: define i1 @abs_int_min_is_poison(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true)
+; CHECK-NEXT: ret i1 true
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true)
+ %cmp = icmp sge i32 %abs, %arg
+ ret i1 %cmp
+}
+
+define i1 @abs_plus_one(i32 %arg) {
+; CHECK-LABEL: define i1 @abs_plus_one(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true)
+; CHECK-NEXT: [[ABS_PLUS_ONE:%.*]] = add nsw i32 [[ABS]], 1
+; CHECK-NEXT: ret i1 true
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true)
+ %abs_plus_one = add nsw i32 %abs, 1
+ %cmp = icmp sge i32 %abs_plus_one, %arg
+ ret i1 %cmp
+}
+
+define i1 @arg_minus_one_strict_less(i32 %arg) {
+; CHECK-LABEL: define i1 @arg_minus_one_strict_less(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true)
+; CHECK-NEXT: [[ARG_MINUS_ONE:%.*]] = add nsw i32 [[ARG]], -1
+; CHECK-NEXT: ret i1 true
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true)
+ %arg_minus_one = add nsw i32 %arg, -1
+ %cmp = icmp slt i32 %arg_minus_one, %abs
+ ret i1 %cmp
+}
+
+define i1 @arg_minus_one_strict_greater(i32 %arg) {
+; CHECK-LABEL: define i1 @arg_minus_one_strict_greater(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true)
+; CHECK-NEXT: [[ARG_MINUS_ONE:%.*]] = add nsw i32 [[ARG]], -1
+; CHECK-NEXT: ret i1 false
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true)
+ %arg_minus_one = add nsw i32 %arg, -1
+ %cmp = icmp sgt i32 %arg_minus_one, %abs
+ ret i1 %cmp
+}
+
+define i1 @abs_plus_one_unsigned_greater_or_equal_nonnegative_arg(i32 %arg) {
+; CHECK-LABEL: define i1 @abs_plus_one_unsigned_greater_or_equal_nonnegative_arg(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[CMP_ARG_NONNEGATIVE:%.*]] = icmp sge i32 [[ARG]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ARG_NONNEGATIVE]])
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true)
+; CHECK-NEXT: [[ABS_PLUS_ONE:%.*]] = add nuw i32 [[ABS]], 1
+; CHECK-NEXT: ret i1 true
+;
+ %cmp_arg_nonnegative = icmp sge i32 %arg, 0
+ call void @llvm.assume(i1 %cmp_arg_nonnegative)
+ %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true)
+ %abs_plus_one = add nuw i32 %abs, 1
+ %cmp = icmp uge i32 %abs_plus_one, %arg
+ ret i1 %cmp
+}
+
+define i1 @abs_plus_one_unsigned_greater_or_equal_cannot_be_simplified(i32 %arg) {
+; CHECK-LABEL: define i1 @abs_plus_one_unsigned_greater_or_equal_cannot_be_simplified(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 [[ARG]], i1 true)
+; CHECK-NEXT: [[ABS_PLUS_ONE:%.*]] = add nuw i32 [[ABS]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[ABS_PLUS_ONE]], [[ARG]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 %arg, i1 true)
+ %abs_plus_one = add nuw i32 %abs, 1
+ %cmp = icmp uge i32 %abs_plus_one, %arg
+ ret i1 %cmp
+}
+
+define i1 @abs_constant_negative_arg() {
+; CHECK-LABEL: define i1 @abs_constant_negative_arg() {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 -3, i1 false)
+; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[ABS]], 3
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 -3, i1 false)
+ %cmp = icmp sge i32 %abs, 3
+ ret i1 %cmp
+}
+
+define i1 @abs_constant_positive_arg() {
+; CHECK-LABEL: define i1 @abs_constant_positive_arg() {
+; CHECK-NEXT: [[ABS:%.*]] = tail call i32 @llvm.abs.i32(i32 3, i1 false)
+; CHECK-NEXT: ret i1 true
+;
+ %abs = tail call i32 @llvm.abs.i32(i32 3, i1 false)
+ %cmp = icmp sge i32 %abs, 3
+ ret i1 %cmp
+}
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/ConstraintElimination/add-nuw.ll b/llvm/test/Transforms/ConstraintElimination/add-nuw.ll
index be3b66af10e0..a8a474e6d450 100644
--- a/llvm/test/Transforms/ConstraintElimination/add-nuw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/add-nuw.ll
@@ -235,8 +235,7 @@ define void @test.decompose.nonconst(i8 %a, i8 %b, i8 %c, i8 %d) {
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw i8 [[A]], [[A]]
; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw i8 [[A]], [[D:%.*]]
-; CHECK-NEXT: [[C_4:%.*]] = icmp uge i8 [[ADD_2]], [[C]]
-; CHECK-NEXT: call void @use(i1 [[C_4]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: ret void
; CHECK: if.end:
; CHECK-NEXT: ret void
@@ -278,14 +277,11 @@ define void @test.decompose.nonconst.no.null.check(i8 %a, i8 %b, i8 %c, i8 %d) {
; CHECK-NEXT: br i1 [[AND_0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[ADD_0:%.*]] = add nuw i8 [[A]], [[B]]
-; CHECK-NEXT: [[T_0:%.*]] = icmp uge i8 [[ADD_0]], [[C]]
-; CHECK-NEXT: call void @use(i1 [[T_0]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw i8 [[A]], [[A]]
-; CHECK-NEXT: [[T_1:%.*]] = icmp uge i8 [[ADD_0]], [[C]]
-; CHECK-NEXT: call void @use(i1 [[T_1]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw i8 [[A]], [[D:%.*]]
-; CHECK-NEXT: [[C_4:%.*]] = icmp uge i8 [[ADD_2]], [[C]]
-; CHECK-NEXT: call void @use(i1 [[C_4]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: ret void
; CHECK: if.end:
; CHECK-NEXT: ret void
@@ -318,8 +314,7 @@ define i1 @test_n_must_ule_1_due_to_nuw(i8 %n, i8 %i) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SUB_N_1:%.*]] = add nuw i8 [[N:%.*]], -1
; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[I:%.*]], [[SUB_N_1]]
-; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8 [[I]], [[ADD]]
-; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; CHECK: if.then:
; CHECK-NEXT: ret i1 true
; CHECK: if.end:
@@ -376,8 +371,7 @@ define i1 @test_n_must_ule_2_due_to_nuw(i8 %n, i8 %i) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SUB_N_1:%.*]] = add nuw i8 [[N:%.*]], -2
; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[I:%.*]], [[SUB_N_1]]
-; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8 [[I]], [[ADD]]
-; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
; CHECK: if.then:
; CHECK-NEXT: ret i1 true
; CHECK: if.end:
@@ -435,10 +429,9 @@ define i1 @add_nuw_neg_pr54224_i16(i16 %a) {
; CHECK-LABEL: @add_nuw_neg_pr54224_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[NEG2:%.*]] = add nuw i16 [[A:%.*]], -305
-; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]]
-; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK-NEXT: br i1 false, label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
; CHECK: exit.1:
-; CHECK-NEXT: ret i1 false
+; CHECK-NEXT: ret i1 true
; CHECK: exit.2:
; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i16 [[A]], 0
; CHECK-NEXT: ret i1 [[C_3]]
@@ -464,8 +457,7 @@ define i1 @add_nuw_neg_pr54224_i64(i64 %a) {
; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i64 0, [[NEG2]]
; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
; CHECK: exit.1:
-; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i64 [[A]], 0
-; CHECK-NEXT: ret i1 [[C_2]]
+; CHECK-NEXT: ret i1 true
; CHECK: exit.2:
; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i64 [[A]], 0
; CHECK-NEXT: ret i1 [[C_3]]
diff --git a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll
index 2de7592d5ccc..6bbc73c9c996 100644
--- a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll
+++ b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll
@@ -181,9 +181,8 @@ define i1 @test_remove_variables(i1 %c, ptr %A, i64 %B, ptr %C) {
; CHECK-NEXT: [[C_1:%.*]] = icmp ult ptr [[TMP0]], [[A:%.*]]
; CHECK-NEXT: br i1 [[C_1]], label [[THEN_2:%.*]], label [[ELSE_2:%.*]]
; CHECK: then.2:
-; CHECK-NEXT: [[C_2:%.*]] = icmp ne ptr [[A]], null
; CHECK-NEXT: [[C_3:%.*]] = icmp sgt i64 [[B:%.*]], 0
-; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_2]], [[C_3]]
+; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[C_3]]
; CHECK-NEXT: ret i1 [[AND]]
; CHECK: else.2:
; CHECK-NEXT: ret i1 false
@@ -478,3 +477,24 @@ entry:
%and = select i1 %c.1, i1 %c.2, i1 false
ret i1 %and
}
+
+define i1 @and_select_second_implies_first_guaranteed_not_poison(ptr noundef %A, ptr noundef %B) {
+; CHECK-LABEL: @and_select_second_implies_first_guaranteed_not_poison(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[C_1:%.*]] = icmp ne ptr [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds ptr, ptr [[B]], i64 -1
+; CHECK-NEXT: [[C_2:%.*]] = icmp ugt ptr [[GEP]], [[A]]
+; CHECK-NEXT: call void @no_noundef(i1 [[C_2]])
+; CHECK-NEXT: [[AND:%.*]] = select i1 [[C_1]], i1 [[C_2]], i1 false
+; CHECK-NEXT: ret i1 [[AND]]
+;
+entry:
+ %c.1 = icmp ne ptr %A, %B
+ %gep = getelementptr inbounds ptr, ptr %B, i64 -1
+ %c.2 = icmp ugt ptr %gep, %A
+ call void @no_noundef(i1 %c.2)
+ %and = select i1 %c.1, i1 %c.2, i1 false
+ ret i1 %and
+}
+
+declare void @no_noundef(i1 noundef)
diff --git a/llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll b/llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll
new file mode 100644
index 000000000000..88f87f4afab2
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/constraint-overflow.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s
+
+define i32 @f(i64 %a3, i64 %numElements) {
+; CHECK-LABEL: define i32 @f(
+; CHECK-SAME: i64 [[A3:%.*]], i64 [[NUMELEMENTS:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp ule i64 [[NUMELEMENTS]], 1152921504606846975
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[A1:%.*]] = shl nuw i64 [[NUMELEMENTS]], 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A1]], [[A3]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_END_I:%.*]], label [[ABORT:%.*]]
+; CHECK: if.end.i:
+; CHECK-NEXT: [[CMP2_NOT_I:%.*]] = icmp ult i64 [[A1]], [[A3]]
+; CHECK-NEXT: br i1 [[CMP2_NOT_I]], label [[ABORT]], label [[EXIT:%.*]]
+; CHECK: abort:
+; CHECK-NEXT: ret i32 -1
+; CHECK: exit:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %cond = icmp ule i64 %numElements, 1152921504606846975
+ call void @llvm.assume(i1 %cond)
+ %a1 = shl nuw i64 %numElements, 4
+ br label %if.end
+if.end:
+ %cmp = icmp ugt i64 %a1, %a3
+ br i1 %cmp, label %if.end.i, label %abort
+if.end.i:
+ %cmp2.not.i = icmp ult i64 %a1, %a3
+ br i1 %cmp2.not.i, label %abort, label %exit
+abort:
+ ret i32 -1
+exit:
+ ret i32 0
+}
+
+declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
index ea65e890e4f3..52adc78b4e15 100644
--- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
+++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll
@@ -289,8 +289,7 @@ define i4 @ptr_N_step_zext_n_zext(ptr %src, ptr %lower, ptr %upper, i16 %N, i16
; CHECK-NEXT: br i1 [[STEP_ULT_N]], label [[PTR_CHECK:%.*]], label [[EXIT:%.*]]
; CHECK: ptr.check:
; CHECK-NEXT: [[SRC_STEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[STEP_ADD_1_EXT]]
-; CHECK-NEXT: [[CMP_STEP_START:%.*]] = icmp ult ptr [[SRC_STEP]], [[LOWER]]
-; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 [[CMP_STEP_START]], false
+; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 false, false
; CHECK-NEXT: br i1 [[OR_CHECK]], label [[TRAP_BB]], label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret i4 3
@@ -344,9 +343,8 @@ define i4 @ptr_N_step_zext_n_zext_out_of_bounds(ptr %src, ptr %lower, ptr %upper
; CHECK-NEXT: br i1 [[STEP_ULT_N]], label [[PTR_CHECK:%.*]], label [[EXIT:%.*]]
; CHECK: ptr.check:
; CHECK-NEXT: [[SRC_STEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[STEP_ADD_2_EXT]]
-; CHECK-NEXT: [[CMP_STEP_START:%.*]] = icmp ult ptr [[SRC_STEP]], [[LOWER]]
; CHECK-NEXT: [[CMP_STEP_END:%.*]] = icmp uge ptr [[SRC_STEP]], [[UPPER]]
-; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 [[CMP_STEP_START]], [[CMP_STEP_END]]
+; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 false, [[CMP_STEP_END]]
; CHECK-NEXT: br i1 [[OR_CHECK]], label [[TRAP_BB]], label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret i4 3
diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll
index 0a835fb38a24..a4d825b32796 100644
--- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll
+++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll
@@ -530,9 +530,8 @@ define i4 @ptr_N_unsigned_positive(ptr %src, ptr %lower, ptr %upper, i16 %N, i16
; CHECK-NEXT: [[SRC_END:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i16 [[N:%.*]]
; CHECK-NEXT: [[CMP_SRC_START:%.*]] = icmp ult ptr [[SRC]], [[LOWER:%.*]]
; CHECK-NEXT: [[CMP_SRC_END:%.*]] = icmp uge ptr [[SRC_END]], [[UPPER:%.*]]
-; CHECK-NEXT: [[N_NEG:%.*]] = icmp ult i16 [[N]], 0
; CHECK-NEXT: [[OR_PRECOND_0:%.*]] = or i1 [[CMP_SRC_START]], [[CMP_SRC_END]]
-; CHECK-NEXT: [[OR_PRECOND_1:%.*]] = or i1 [[OR_PRECOND_0]], [[N_NEG]]
+; CHECK-NEXT: [[OR_PRECOND_1:%.*]] = or i1 [[OR_PRECOND_0]], false
; CHECK-NEXT: br i1 [[OR_PRECOND_1]], label [[TRAP_BB:%.*]], label [[STEP_CHECK:%.*]]
; CHECK: trap.bb:
; CHECK-NEXT: ret i4 2
diff --git a/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll b/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll
index e72f1ee9b370..a86cd5c04d5b 100644
--- a/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll
+++ b/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll
@@ -369,8 +369,7 @@ define i1 @ptr.int.struct.test.ult.due.to.second.dimension.var.index(ptr %start,
; CHECK-NEXT: [[IDX_EXT:%.*]] = zext i32 [[IDX]] to i64
; CHECK-NEXT: [[START_0:%.*]] = getelementptr inbounds [[STRUCT_2]], ptr [[START]], i64 6, i32 0
; CHECK-NEXT: [[START_0_CAST:%.*]] = bitcast ptr [[START_0]] to ptr
-; CHECK-NEXT: [[C_0:%.*]] = icmp ult ptr [[START_0_CAST]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[C_0]]
+; CHECK-NEXT: ret i1 true
; CHECK: if.end:
; CHECK-NEXT: ret i1 true
;
diff --git a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
index 9568b155af13..a80e492e0824 100644
--- a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
+++ b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll
@@ -128,8 +128,7 @@ define i1 @gep_decomp_i80(ptr %a) {
; CHECK-LABEL: @gep_decomp_i80(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i80 1973801615886922022913
-; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[GEP]], null
-; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT: br i1 false, label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: then:
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i80 1973801615886922022913
; CHECK-NEXT: ret i1 true
diff --git a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
index 17a54b6ecbe2..e3f2a54f321e 100644
--- a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
+++ b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
@@ -23,7 +23,7 @@ define void @checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 %n)
; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]]
; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]]
; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]]
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP_PTR_IV_LOWER]], [[CMP_PTR_IV_UPPER]]
+; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]]
; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4
@@ -88,7 +88,7 @@ define void @some_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8
; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]]
; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]]
; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]]
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP_PTR_IV_LOWER]], [[CMP_PTR_IV_UPPER]]
+; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]]
; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_BODY:%.*]]
; CHECK: loop.body:
; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1
@@ -166,7 +166,7 @@ define void @no_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 %
; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]]
; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]]
; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]]
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP_PTR_IV_LOWER]], [[CMP_PTR_IV_UPPER]]
+; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]]
; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_BODY:%.*]]
; CHECK: loop.body:
; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1
diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll
index 8245e84108e4..7b928a030614 100644
--- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll
+++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll
@@ -20,8 +20,7 @@ define void @loop_phi_pos_start_value(i32 %y, i1 %c, i32 %n) {
; CHECK-NEXT: call void @use(i1 [[C_2]])
; CHECK-NEXT: [[C_3:%.*]] = icmp sgt i32 [[X]], 9
; CHECK-NEXT: call void @use(i1 [[C_3]])
-; CHECK-NEXT: [[C_4:%.*]] = icmp sge i32 [[X]], 0
-; CHECK-NEXT: call void @use(i1 [[C_4]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[C_5:%.*]] = icmp sge i32 [[X]], 9
; CHECK-NEXT: call void @use(i1 [[C_5]])
; CHECK-NEXT: [[X_NEXT]] = add nsw i32 [[X]], 1
diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
index b8ae10f42f03..66ce1ffc6ebc 100644
--- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
+++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
@@ -19,9 +19,8 @@ define void @test1(ptr %src, ptr noundef %lower, ptr noundef %upper, i8 %N) {
; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]]
; CHECK: loop.body:
; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]]
-; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]]
-; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]]
+; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]]
; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]]
; CHECK: loop.body.1:
; CHECK-NEXT: [[PTR_SRC_IV:%.*]] = bitcast ptr [[SRC_IV]] to ptr
@@ -120,9 +119,8 @@ define void @test2(ptr %src, ptr %lower, ptr %upper, i8 %N) {
; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]]
; CHECK: loop.body:
; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]]
-; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]]
-; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]]
+; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]]
; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]]
; CHECK: loop.body.1:
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
@@ -218,7 +216,7 @@ define void @test2_with_ne(ptr %src, ptr %lower, ptr %upper, i8 %N) {
; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]]
; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]]
-; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]]
+; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]]
; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]]
; CHECK: loop.body.1:
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
@@ -314,7 +312,7 @@ define void @test3(ptr %src, ptr %lower, ptr %upper, i8 %N) {
; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]]
; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]]
; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]]
-; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]]
+; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]]
; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]]
; CHECK: loop.body.1:
; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[NEXT]]
diff --git a/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll b/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll
index 752e012791ee..0e078109ed66 100644
--- a/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll
+++ b/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=constraint-elimination -S %s | FileCheck --check-prefixes=COMMON,SIMP %s
-; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=4 -S %s | FileCheck --check-prefixes=COMMON,SIMP %s
-; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=3 -S %s | FileCheck --check-prefixes=COMMON,NOSIMP %s
+; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=9 -S %s | FileCheck --check-prefixes=COMMON,SIMP %s
+; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=8 -S %s | FileCheck --check-prefixes=COMMON,NOSIMP %s
define i1 @test_max_row_limit(i32 %l0, i32 %l1, i32 %l2, i32 %l3, i32 %l4) {
diff --git a/llvm/test/Transforms/ConstraintElimination/mul.ll b/llvm/test/Transforms/ConstraintElimination/mul.ll
index c8a1d31a48d5..362cd33fe650 100644
--- a/llvm/test/Transforms/ConstraintElimination/mul.ll
+++ b/llvm/test/Transforms/ConstraintElimination/mul.ll
@@ -9,8 +9,7 @@ define i1 @test_mul_const_nuw_unsigned_1(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_MUL_4:%.*]] = mul nuw i8 [[START:%.*]], 4
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
-; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T_1]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.mul.4 = mul nuw i8 %start, 4
@@ -28,8 +27,7 @@ define i1 @test_mul_const_nuw_unsigned_2(i8 %start, i8 %high) {
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
; CHECK-NEXT: [[START_MUL_2:%.*]] = mul nuw i8 [[START]], 2
-; CHECK-NEXT: [[T:%.*]] = icmp ult i8 [[START_MUL_2]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.mul.4 = mul nuw i8 %start, 4
@@ -163,8 +161,7 @@ define i1 @test_mul_const_nuw_unsigned_8(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
; CHECK-NEXT: [[START_ADD_2_1:%.*]] = add nuw i8 [[START_ADD_2]], 1
; CHECK-NEXT: [[START_MUL_3:%.*]] = mul nuw i8 [[START]], 3
-; CHECK-NEXT: [[T_5:%.*]] = icmp ule i8 [[START_ADD_1]], [[START_MUL_3]]
-; CHECK-NEXT: ret i1 [[T_5]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.mul.4 = mul nuw i8 %start, 4
@@ -191,8 +188,7 @@ define i1 @test_mul_const_nuw_unsigned_9(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
; CHECK-NEXT: [[START_ADD_2_1:%.*]] = add nuw i8 [[START_ADD_2]], 1
; CHECK-NEXT: [[START_MUL_3:%.*]] = mul nuw i8 [[START]], 3
-; CHECK-NEXT: [[F_5:%.*]] = icmp ult i8 [[START_ADD_2]], [[START_MUL_3]]
-; CHECK-NEXT: ret i1 [[F_5]]
+; CHECK-NEXT: ret i1 false
;
entry:
%start.mul.4 = mul nuw i8 %start, 4
@@ -371,8 +367,7 @@ define i1 @test_mul_add_const_nuw_unsigned_1(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_MUL_4:%.*]] = mul nuw i8 [[ADD]], 4
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
-; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T_1]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
@@ -392,8 +387,7 @@ define i1 @test_mul_add_const_nuw_unsigned_2(i8 %start, i8 %high) {
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
; CHECK-NEXT: [[START_MUL_2:%.*]] = mul nuw i8 [[START]], 2
-; CHECK-NEXT: [[T_2:%.*]] = icmp ult i8 [[START_MUL_2]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T_2]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
diff --git a/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll
index fea6f2d8a5dc..f5c108822b8c 100644
--- a/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll
+++ b/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll
@@ -181,9 +181,8 @@ define i1 @test_remove_variables(i1 %c, ptr %A, i64 %B, ptr %C) {
; CHECK-NEXT: [[C_1:%.*]] = icmp ult ptr [[TMP0]], [[A:%.*]]
; CHECK-NEXT: br i1 [[C_1]], label [[THEN_2:%.*]], label [[ELSE_2:%.*]]
; CHECK: then.2:
-; CHECK-NEXT: [[C_2:%.*]] = icmp ne ptr [[A]], null
; CHECK-NEXT: [[C_3:%.*]] = icmp sgt i64 [[B:%.*]], 0
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[C_2]], [[C_3]]
+; CHECK-NEXT: [[OR:%.*]] = or i1 true, [[C_3]]
; CHECK-NEXT: ret i1 [[OR]]
; CHECK: else.2:
; CHECK-NEXT: ret i1 false
diff --git a/llvm/test/Transforms/ConstraintElimination/or.ll b/llvm/test/Transforms/ConstraintElimination/or.ll
index 2f24519ddd1d..01b8ca973efa 100644
--- a/llvm/test/Transforms/ConstraintElimination/or.ll
+++ b/llvm/test/Transforms/ConstraintElimination/or.ll
@@ -135,15 +135,13 @@ define i1 @test_or_chain_ule_1(i4 %x, i4 %y, i4 %z, i4 %a, i4 %b) {
; CHECK-NEXT: [[RES_2:%.*]] = xor i1 [[RES_1]], [[C_7]]
; CHECK-NEXT: ret i1 [[RES_2]]
; CHECK: exit:
-; CHECK-NEXT: [[RES_3:%.*]] = xor i1 false, false
+; CHECK-NEXT: [[RES_3:%.*]] = xor i1 true, true
; CHECK-NEXT: [[RES_4:%.*]] = xor i1 [[RES_3]], true
; CHECK-NEXT: [[RES_5:%.*]] = xor i1 [[RES_4]], true
; CHECK-NEXT: [[RES_6:%.*]] = xor i1 [[RES_5]], true
; CHECK-NEXT: [[RES_7:%.*]] = xor i1 [[RES_6]], true
-; CHECK-NEXT: [[C_8:%.*]] = icmp ule i4 [[X]], [[A]]
-; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], [[C_8]]
-; CHECK-NEXT: [[C_9:%.*]] = icmp ule i4 [[X]], [[B:%.*]]
-; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], [[C_9]]
+; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], true
+; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], true
; CHECK-NEXT: ret i1 [[RES_9]]
;
entry:
@@ -210,15 +208,13 @@ define i1 @test_or_chain_ule_2(i4 %x, i4 %y, i4 %z, i4 %a, i4 %b) {
; CHECK-NEXT: [[RES_2:%.*]] = xor i1 [[RES_1]], [[C_7]]
; CHECK-NEXT: ret i1 [[RES_2]]
; CHECK: exit:
-; CHECK-NEXT: [[RES_3:%.*]] = xor i1 false, false
+; CHECK-NEXT: [[RES_3:%.*]] = xor i1 true, true
; CHECK-NEXT: [[RES_4:%.*]] = xor i1 [[RES_3]], true
; CHECK-NEXT: [[RES_5:%.*]] = xor i1 [[RES_4]], true
; CHECK-NEXT: [[RES_6:%.*]] = xor i1 [[RES_5]], true
; CHECK-NEXT: [[RES_7:%.*]] = xor i1 [[RES_6]], true
-; CHECK-NEXT: [[C_8:%.*]] = icmp ule i4 [[X]], [[A]]
-; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], [[C_8]]
-; CHECK-NEXT: [[C_9:%.*]] = icmp ule i4 [[X]], [[B:%.*]]
-; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], [[C_9]]
+; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], true
+; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], true
; CHECK-NEXT: ret i1 [[RES_9]]
;
entry:
@@ -354,8 +350,7 @@ define i1 @test_or_chain_with_and_ule(i4 %x, i4 %y, i4 %z, i4 %a, i4 %b) {
; CHECK-NEXT: [[RES_6:%.*]] = xor i1 [[RES_5]], [[C_8]]
; CHECK-NEXT: [[C_9:%.*]] = icmp ule i4 [[X]], [[B:%.*]]
; CHECK-NEXT: [[RES_7:%.*]] = xor i1 [[RES_6]], [[C_9]]
-; CHECK-NEXT: [[C_10:%.*]] = icmp ule i4 2, [[X]]
-; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], [[C_10]]
+; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], true
; CHECK-NEXT: [[C_11:%.*]] = icmp ugt i4 2, [[A]]
; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], [[C_11]]
; CHECK-NEXT: ret i1 [[RES_9]]
diff --git a/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll b/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll
index 3c95b192705a..fb13417c7a6c 100644
--- a/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll
+++ b/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll
@@ -14,8 +14,7 @@ define i1 @addition_with_extra_facts_and_args_ult_i64(i64 noundef %a, i64 nounde
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i64 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[T:%.*]] = icmp ult i64 [[A]], [[C]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%cmp.a = icmp ule i64 %a, 2048
@@ -40,8 +39,7 @@ define i1 @addition_with_extra_facts_and_args_ult_1(i16 noundef %a, i16 noundef
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[A]], [[C]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%cmp.a = icmp ule i16 %a, 2048
@@ -66,8 +64,7 @@ define i1 @addition_with_extra_facts_and_args_ult_2(i16 noundef %a, i16 noundef
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[B]], [[C]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%cmp.a = icmp ule i16 %a, 2048
@@ -92,8 +89,7 @@ define i1 @addition_with_extra_facts_and_args_ult_3(i16 noundef %a, i16 noundef
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[A]], [[C]]
-; CHECK-NEXT: ret i1 [[F]]
+; CHECK-NEXT: ret i1 false
;
entry:
%cmp.a = icmp ule i16 %a, 2048
@@ -118,8 +114,7 @@ define i1 @addition_with_extra_facts_and_args_ult_4(i16 noundef %a, i16 noundef
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[B]], [[C]]
-; CHECK-NEXT: ret i1 [[F]]
+; CHECK-NEXT: ret i1 false
;
entry:
%cmp.a = icmp ule i16 %a, 2048
@@ -201,8 +196,7 @@ define i1 @addition_with_extra_facts_and_return_value_ult_1() {
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[A]], [[C]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%a = call i16 @get()
@@ -232,8 +226,7 @@ define i1 @addition_with_extra_facts_and_return_value_ult_2() {
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[A]], [[C]]
-; CHECK-NEXT: ret i1 [[F]]
+; CHECK-NEXT: ret i1 false
;
entry:
%a = call i16 @get()
@@ -259,8 +252,7 @@ define i1 @addition_no_extra_facts_with_return_value_ult_1() {
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[A]], [[C]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%a = call i16 @get()
@@ -282,8 +274,7 @@ define i1 @addition_no_extra_facts_with_return_value_ult_2() {
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]]
; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]]
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]])
-; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[A]], [[C]]
-; CHECK-NEXT: ret i1 [[F]]
+; CHECK-NEXT: ret i1 false
;
entry:
%a = call i16 @get()
@@ -326,8 +317,7 @@ define i1 @assume_x_ugt_y_plus_y_via_shl_eq(i8 %x, i8 %y) {
; CHECK-NEXT: [[S:%.*]] = shl nuw i8 [[Y]], 1
; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
; CHECK-NEXT: tail call void @llvm.assume(i1 [[C_1]])
-; CHECK-NEXT: [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]]
-; CHECK-NEXT: ret i1 [[C_2]]
+; CHECK-NEXT: ret i1 false
;
%s = shl nuw i8 %y, 1
%c.1 = icmp ugt i8 %x, %s
@@ -358,8 +348,7 @@ define i1 @assume_x_ugt_y_plus_y_via_add_eq(i8 %x, i8 %y) {
; CHECK-NEXT: [[S:%.*]] = add nuw i8 [[Y]], [[Y]]
; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
; CHECK-NEXT: tail call void @llvm.assume(i1 [[C_1]])
-; CHECK-NEXT: [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]]
-; CHECK-NEXT: ret i1 [[C_2]]
+; CHECK-NEXT: ret i1 false
;
%s = add nuw i8 %y, %y
%c.1 = icmp ugt i8 %x, %s
@@ -390,8 +379,7 @@ define i1 @assume_x_ugt_y_plus_y_via_shl_ne(i8 %x, i8 %y) {
; CHECK-NEXT: [[S:%.*]] = shl nuw i8 [[Y]], 1
; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]]
; CHECK-NEXT: tail call void @llvm.assume(i1 [[C_1]])
-; CHECK-NEXT: [[C_2:%.*]] = icmp ne i8 [[X]], [[Y]]
-; CHECK-NEXT: ret i1 [[C_2]]
+; CHECK-NEXT: ret i1 true
;
%s = shl nuw i8 %y, 1
%c.1 = icmp ugt i8 %x, %s
diff --git a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll
index 4fdc8e583112..f0b20fb36059 100644
--- a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll
+++ b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll
@@ -5,6 +5,7 @@
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Condition icmp eq ptr %a, null implied by dominating constraints
+; CHECK-NEXT: -1 * %a <= 0
; CHECK-NEXT: %a <= 0
; CHECK-NEXT: Creating reproducer for %c.2 = icmp eq ptr %a, null
; CHECK-NEXT: found external input ptr %a
diff --git a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll
index 9f8ae3825a8d..f912abfc24a8 100644
--- a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll
+++ b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll
@@ -183,14 +183,14 @@ else:
define i32 @test_branch(i32 %a) {
; CHECK-LABEL: define i1 @"{{.+}}test_branchrepro"(i32 %a) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: %0 = icmp ult i32 %a, 0
+; CHECK-NEXT: %0 = icmp ult i32 %a, 4
; CHECK-NEXT: call void @llvm.assume(i1 %0)
; CHECK-NEXT: %c.2 = icmp ugt i32 0, 0
; CHECK-NEXT: ret i1 %c.2
; CHECK-NEXT: }
;
entry:
- %c.1 = icmp ult i32 %a, 0
+ %c.1 = icmp ult i32 %a, 4
br i1 %c.1, label %then, label %exit
then:
diff --git a/llvm/test/Transforms/ConstraintElimination/shl.ll b/llvm/test/Transforms/ConstraintElimination/shl.ll
index 9f98a9d3a57c..9fe8c147017b 100644
--- a/llvm/test/Transforms/ConstraintElimination/shl.ll
+++ b/llvm/test/Transforms/ConstraintElimination/shl.ll
@@ -9,8 +9,7 @@ define i1 @test_shl_const_nuw_unsigned_1(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_SHL_4:%.*]] = shl nuw i8 [[START:%.*]], 4
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
-; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T_1]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.shl.4 = shl nuw i8 %start, 4
@@ -28,8 +27,7 @@ define i1 @test_shl_const_nuw_unsigned_2(i8 %start, i8 %high) {
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2
-; CHECK-NEXT: [[T:%.*]] = icmp ult i8 [[START_SHL_2]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.shl.4 = shl nuw i8 %start, 4
@@ -49,8 +47,7 @@ define i1 @test_shl_const_nuw_unsigned_3(i8 %start, i8 %high) {
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2
; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]]
-; CHECK-NEXT: [[T:%.*]] = icmp ule i8 [[START_ADD_1]], [[START_SHL_2]]
-; CHECK-NEXT: ret i1 [[T]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.shl.4 = shl nuw i8 %start, 4
@@ -94,8 +91,7 @@ define i1 @test_shl_const_nuw_unsigned_5(i8 %start, i8 %high) {
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]]
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
-; CHECK-NEXT: [[T_4:%.*]] = icmp ule i8 [[START_ADD_2]], [[START_SHL_4]]
-; CHECK-NEXT: ret i1 [[T_4]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.shl.4 = shl nuw i8 %start, 4
@@ -167,8 +163,7 @@ define i1 @test_shl_const_nuw_unsigned_8(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
; CHECK-NEXT: [[START_ADD_2_1:%.*]] = add nuw i8 [[START_ADD_2]], 1
; CHECK-NEXT: [[START_SHL_3:%.*]] = shl nuw i8 [[START]], 3
-; CHECK-NEXT: [[T_5:%.*]] = icmp ule i8 [[START_ADD_1]], [[START_SHL_3]]
-; CHECK-NEXT: ret i1 [[T_5]]
+; CHECK-NEXT: ret i1 true
;
entry:
%start.shl.4 = shl nuw i8 %start, 4
@@ -296,8 +291,7 @@ define i1 @test_shl_add_const_nuw_unsigned_1(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_SHL_4:%.*]] = shl nuw i8 [[ADD]], 4
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
-; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T_1]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
@@ -317,8 +311,7 @@ define i1 @test_shl_add_const_nuw_unsigned_2(i8 %start, i8 %high) {
; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]]
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2
-; CHECK-NEXT: [[T_2:%.*]] = icmp ult i8 [[START_SHL_2]], [[HIGH]]
-; CHECK-NEXT: ret i1 [[T_2]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
@@ -340,8 +333,7 @@ define i1 @test_shl_add_const_nuw_unsigned_3(i8 %start, i8 %high) {
; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]])
; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]]
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
-; CHECK-NEXT: [[T_3:%.*]] = icmp ule i8 [[START_ADD_2]], [[START_SHL_4]]
-; CHECK-NEXT: ret i1 [[T_3]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
@@ -365,8 +357,7 @@ define i1 @test_shl_add_const_nuw_unsigned_4(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2
; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]]
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
-; CHECK-NEXT: [[T_4:%.*]] = icmp ult i8 [[START_ADD_2]], [[START_SHL_4]]
-; CHECK-NEXT: ret i1 [[T_4]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
@@ -391,8 +382,7 @@ define i1 @test_shl_add_const_nuw_unsigned_5(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]]
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
; CHECK-NEXT: [[START_ADD_2_12:%.*]] = add nuw i8 [[START_ADD_2]], 12
-; CHECK-NEXT: [[T_5:%.*]] = icmp ule i8 [[START_ADD_2_12]], [[START_SHL_4]]
-; CHECK-NEXT: ret i1 [[T_5]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
@@ -417,8 +407,7 @@ define i1 @test_shl_add_const_nuw_unsigned_6(i8 %start, i8 %high) {
; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]]
; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]]
; CHECK-NEXT: [[START_ADD_2_13:%.*]] = add nuw i8 [[START_ADD_2]], 13
-; CHECK-NEXT: [[F_1:%.*]] = icmp ule i8 [[START_ADD_2_13]], [[START_SHL_4]]
-; CHECK-NEXT: ret i1 [[F_1]]
+; CHECK-NEXT: ret i1 true
;
entry:
%add = add nuw i8 %start, 3
diff --git a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll
index 30051080178e..0d90bc2fb343 100644
--- a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll
+++ b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll
@@ -316,10 +316,9 @@ define i1 @sub_nuw_neg_i16(i16 %a) {
; CHECK-LABEL: @sub_nuw_neg_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[NEG2:%.*]] = sub nuw i16 [[A:%.*]], -305
-; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]]
-; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
+; CHECK-NEXT: br i1 false, label [[EXIT_1:%.*]], label [[EXIT_2:%.*]]
; CHECK: exit.1:
-; CHECK-NEXT: ret i1 false
+; CHECK-NEXT: ret i1 true
; CHECK: exit.2:
; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i16 [[A]], 0
; CHECK-NEXT: ret i1 [[C_3]]
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll
new file mode 100644
index 000000000000..acd6a08d7c1b
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll
@@ -0,0 +1,64 @@
+; RUN: opt < %s -passes='module(coro-early),cgscc(inline,coro-split<reuse-storage>)' -S | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators < %s -passes='module(coro-early),cgscc(inline,coro-split<reuse-storage>)' -S | FileCheck %s
+
+; Simplified version from pr#75104.
+; Make sure we do not update debug location for hoisted dbg.declare intrinsics when optimizing coro frame.
+
+; CHECK-NOT: mismatched subprogram between llvm.dbg.declare variable and !dbg attachment
+
+%"struct.std::coroutine_handle" = type { i8 }
+
+define void @_Z1fv() presplitcoroutine {
+entry:
+ %0 = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %1 = call ptr @llvm.coro.begin(token %0, ptr null), !dbg !10
+ br label %for.cond
+
+for.cond: ; preds = %for.cond, %entry
+ call void @_ZN1BD1Ev()
+ %2 = call token @llvm.coro.save(ptr null)
+ %3 = call i8 @llvm.coro.suspend(token none, i1 false)
+ br label %for.cond
+}
+
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
+declare ptr @llvm.coro.begin(token, ptr writeonly)
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare token @llvm.coro.save(ptr)
+declare i8 @llvm.coro.suspend(token, i1)
+
+define void @_ZN1BD1Ev() {
+entry:
+ %b11 = alloca [0 x [0 x %"struct.std::coroutine_handle"]], i32 0, align 1
+ call void @llvm.dbg.declare(metadata ptr %b11, metadata !13, metadata !DIExpression()), !dbg !21
+ %call = call i1 @_ZNSt16coroutine_handleIvEcvbEv(ptr %b11), !dbg !21
+ ret void
+}
+
+declare i1 @_ZNSt16coroutine_handleIvEcvbEv(ptr)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 18.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "<stdin>", directory: "")
+!2 = !{!3}
+!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+!4 = distinct !DIGlobalVariable(name: "a", scope: !0, file: !5, line: 17, type: !6, isLocal: false, isDefinition: true)
+!5 = !DIFile(filename: "bad.cpp", directory: "")
+!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "coroutine_handle<void>", scope: !7, file: !5, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !8, templateParams: !8, identifier: "_ZTSSt16coroutine_handleIvE")
+!7 = !DINamespace(name: "std", scope: null)
+!8 = !{}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !DILocation(line: 31, column: 7, scope: !11)
+!11 = distinct !DISubprogram(name: "f", linkageName: "_Z1fv", scope: !5, file: !5, line: 31, type: !12, scopeLine: 31, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!12 = distinct !DISubroutineType(types: !8)
+!13 = !DILocalVariable(name: "b", scope: !14, file: !5, line: 27, type: !6)
+!14 = distinct !DILexicalBlock(scope: !15, file: !5, line: 27, column: 14)
+!15 = distinct !DILexicalBlock(scope: !16, file: !5, line: 26, column: 8)
+!16 = distinct !DISubprogram(name: "~B", linkageName: "_ZN1BD2Ev", scope: !17, file: !5, line: 26, type: !18, scopeLine: 26, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !20, retainedNodes: !8)
+!17 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "B", file: !5, line: 18, size: 8, flags: DIFlagTypePassByReference | DIFlagNonTrivial, elements: !8, identifier: "_ZTS1B")
+!18 = !DISubroutineType(types: !19)
+!19 = !{null}
+!20 = !DISubprogram(name: "~B", scope: !17, file: !5, line: 26, type: !18, scopeLine: 26, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!21 = !DILocation(line: 27, column: 14, scope: !14)
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll
index 19a89fefd526..bf51218590c2 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll
@@ -32,8 +32,8 @@
; CHECK: entry:
; CHECK: %j = alloca i32, align 4
; CHECK: call void @llvm.dbg.declare(metadata ptr %j, metadata ![[JVAR:[0-9]+]], metadata !DIExpression()), !dbg ![[JDBGLOC:[0-9]+]]
-; CHECK: %[[MEMORY:.*]] = call ptr @new
-; CHECK: call void @llvm.dbg.declare(metadata ptr %[[MEMORY]], metadata ![[XVAR:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 32)), !dbg ![[IDBGLOC:[0-9]+]]
+; CHECK: %[[MEMORY:.*]] = call ptr @new({{.+}}), !dbg ![[IDBGLOC:[0-9]+]]
+; CHECK: call void @llvm.dbg.declare(metadata ptr %[[MEMORY]], metadata ![[XVAR:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 32)), !dbg ![[IDBGLOC]]
; CHECK: call void @llvm.dbg.declare(metadata ptr %[[MEMORY]], metadata ![[IVAR:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 20)), !dbg ![[IDBGLOC]]
; CHECK: await.ready:
;
@@ -49,18 +49,20 @@
; CHECK: await.ready:
;
; CHECK-DAG: ![[IVAR]] = !DILocalVariable(name: "i"
-; CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: !6, file: !1, line: 23, column: 12)
-; CHECK-DAG: ![[IDBGLOC]] = !DILocation(line: 24, column: 7, scope: ![[SCOPE]])
+; CHECK-DAG: ![[PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov"
+; CHECK-DAG: ![[BLK_SCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[PROG_SCOPE]], file: !1, line: 23, column: 12)
+; CHECK-DAG: ![[IDBGLOC]] = !DILocation(line: 23, column: 6, scope: ![[PROG_SCOPE]])
; CHECK-DAG: ![[XVAR]] = !DILocalVariable(name: "x"
; CHECK-DAG: ![[JVAR]] = !DILocalVariable(name: "j"
-; CHECK-DAG: ![[JDBGLOC]] = !DILocation(line: 32, column: 7, scope: ![[SCOPE]])
+; CHECK-DAG: ![[JDBGLOC]] = !DILocation(line: 32, column: 7, scope: ![[BLK_SCOPE]])
; CHECK-DAG: ![[XVAR_RESUME]] = !DILocalVariable(name: "x"
-; CHECK-DAG: ![[IDBGLOC_RESUME]] = !DILocation(line: 24, column: 7, scope: ![[RESUME_SCOPE:[0-9]+]])
-; CHECK-DAG: ![[RESUME_SCOPE]] = distinct !DILexicalBlock(scope: !22, file: !1, line: 23, column: 12)
+; CHECK-DAG: ![[RESUME_PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov"
+; CHECK-DAG: ![[IDBGLOC_RESUME]] = !DILocation(line: 24, column: 7, scope: ![[RESUME_BLK_SCOPE:[0-9]+]])
+; CHECK-DAG: ![[RESUME_BLK_SCOPE]] = distinct !DILexicalBlock(scope: ![[RESUME_PROG_SCOPE]], file: !1, line: 23, column: 12)
; CHECK-DAG: ![[IVAR_RESUME]] = !DILocalVariable(name: "i"
; CHECK-DAG: ![[JVAR_RESUME]] = !DILocalVariable(name: "j"
-; CHECK-DAG: ![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_SCOPE]])
+; CHECK-DAG: ![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_BLK_SCOPE]])
define void @f() presplitcoroutine !dbg !8 {
entry:
%__promise = alloca i8, align 8
@@ -72,13 +74,13 @@ entry:
br i1 %alloc, label %coro.alloc, label %coro.init
coro.alloc: ; preds = %entry
- %size = call i64 @llvm.coro.size.i64()
- %memory = call ptr @new(i64 %size)
- br label %coro.init
+ %size = call i64 @llvm.coro.size.i64(), !dbg !23
+ %memory = call ptr @new(i64 %size), !dbg !23
+ br label %coro.init, !dbg !23
coro.init: ; preds = %coro.alloc, %entry
- %phi.entry.alloc = phi ptr [ null, %entry ], [ %memory, %coro.alloc ]
- %begin = call ptr @llvm.coro.begin(token %id, ptr %phi.entry.alloc)
+ %phi.entry.alloc = phi ptr [ null, %entry ], [ %memory, %coro.alloc ], !dbg !23
+ %begin = call ptr @llvm.coro.begin(token %id, ptr %phi.entry.alloc), !dbg !23
%ready = call i1 @await_ready()
br i1 %ready, label %init.ready, label %init.suspend
@@ -240,3 +242,4 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
!20 = !DILocation(line: 43, column: 3, scope: !7)
!21 = !DILocation(line: 43, column: 8, scope: !7)
!22 = distinct !DILexicalBlock(scope: !8, file: !1, line: 23, column: 12)
+!23 = !DILocation(line: 23, column: 6, scope: !8)
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
index 9fcf7c320f62..6227a5c822b1 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll
@@ -1910,6 +1910,20 @@ exit:
ret i1 false
}
+define i1 @binop_eval_order(i32 %x) {
+; CHECK-LABEL: @binop_eval_order(
+; CHECK-NEXT: [[A:%.*]] = add nuw nsw i32 [[X:%.*]], 1
+; CHECK-NEXT: [[B:%.*]] = add nuw nsw i32 [[A]], 1
+; CHECK-NEXT: [[C:%.*]] = add nuw nsw i32 [[A]], [[B]]
+; CHECK-NEXT: ret i1 true
+;
+ %a = add nuw nsw i32 %x, 1
+ %b = add nuw nsw i32 %a, 1
+ %c = add nuw nsw i32 %a, %b
+ %d = icmp ugt i32 %c, 2
+ ret i1 %d
+}
+
declare i32 @llvm.uadd.sat.i32(i32, i32)
declare i32 @llvm.usub.sat.i32(i32, i32)
declare i32 @llvm.sadd.sat.i32(i32, i32)
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll b/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll
index 7ec1028d65e0..0b95139f3dcb 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll
@@ -567,3 +567,32 @@ define i16 @cond_value_may_not_well_defined(i16 %x) {
%sel = select i1 %cmp, i16 %and, i16 24
ret i16 %sel
}
+
+define i16 @and_elide_poison_flags(i16 noundef %a) {
+; CHECK-LABEL: @and_elide_poison_flags(
+; CHECK-NEXT: [[X:%.*]] = add nuw i16 [[A:%.*]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[X]], 8
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i16 [[X]], i16 24
+; CHECK-NEXT: ret i16 [[SEL]]
+;
+ %x = add nuw i16 %a, 1
+ %and = and i16 %x, 7
+ %cmp = icmp ult i16 %x, 8
+ %sel = select i1 %cmp, i16 %and, i16 24
+ ret i16 %sel
+}
+
+define i16 @and_elide_poison_flags_missing_noundef(i16 %a) {
+; CHECK-LABEL: @and_elide_poison_flags_missing_noundef(
+; CHECK-NEXT: [[X:%.*]] = add nuw i16 [[A:%.*]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[X]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[X]], 8
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i16 [[AND]], i16 24
+; CHECK-NEXT: ret i16 [[SEL]]
+;
+ %x = add nuw i16 %a, 1
+ %and = and i16 %x, 7
+ %cmp = icmp ult i16 %x, 8
+ %sel = select i1 %cmp, i16 %and, i16 24
+ ret i16 %sel
+}
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll b/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll
index d30b31d317a6..252f6596cedc 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll
@@ -12,8 +12,7 @@ define void @test_icmp_from_implied_cond(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[B]], [[A]]
; CHECK-NEXT: br i1 [[COND]], label [[L2:%.*]], label [[END]]
; CHECK: l2:
-; CHECK-NEXT: [[B_CMP1:%.*]] = icmp ult i32 [[B]], 32
-; CHECK-NEXT: call void @use(i1 [[B_CMP1]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[B_CMP2:%.*]] = icmp ult i32 [[B]], 31
; CHECK-NEXT: call void @use(i1 [[B_CMP2]])
; CHECK-NEXT: ret void
@@ -47,7 +46,7 @@ define i64 @test_sext_from_implied_cond(i32 %a, i32 %b) {
; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[B]], [[A]]
; CHECK-NEXT: br i1 [[COND]], label [[L2:%.*]], label [[END]]
; CHECK: l2:
-; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[B]] to i64
+; CHECK-NEXT: [[SEXT:%.*]] = zext nneg i32 [[B]] to i64
; CHECK-NEXT: ret i64 [[SEXT]]
; CHECK: end:
; CHECK-NEXT: ret i64 0
@@ -74,8 +73,7 @@ define void @test_icmp_from_implied_range(i16 %x, i32 %b) {
; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[B]], [[A]]
; CHECK-NEXT: br i1 [[COND]], label [[L1:%.*]], label [[END:%.*]]
; CHECK: l1:
-; CHECK-NEXT: [[B_CMP1:%.*]] = icmp ult i32 [[B]], 65535
-; CHECK-NEXT: call void @use(i1 [[B_CMP1]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[B_CMP2:%.*]] = icmp ult i32 [[B]], 65534
; CHECK-NEXT: call void @use(i1 [[B_CMP2]])
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll b/llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll
new file mode 100644
index 000000000000..3bb0ca384240
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll
@@ -0,0 +1,42 @@
+; RUN: opt -passes=correlated-propagation -S < %s | FileCheck %s
+; CHECK: %{{[a-zA-Z0-9_]*}} = udiv i8 %x.nonneg, %y, !dbg ![[DBGLOC:[0-9]+]]
+; CHECK-NEXT: %{{[a-zA-Z0-9_]*}}.neg = sub i8 0, %rem1, !dbg ![[DBGLOC]]
+
+; Function Attrs: inaccessiblememonly nocallback nofree nosync nounwind willreturn
+declare void @llvm.assume(i1 noundef) #0
+
+define void @test8_neg_neg(i8 %x, i8 %y) !dbg !5 {
+ %c0 = icmp sle i8 %x, 0, !dbg !13
+ call void @llvm.dbg.value(metadata i1 %c0, metadata !9, metadata !DIExpression()), !dbg !13
+ call void @llvm.assume(i1 %c0), !dbg !14
+ %c1 = icmp sge i8 %y, 0, !dbg !15
+ call void @llvm.dbg.value(metadata i1 %c1, metadata !11, metadata !DIExpression()), !dbg !15
+ call void @llvm.assume(i1 %c1), !dbg !16
+ %rem = sdiv i8 %x, %y, !dbg !17
+ call void @llvm.dbg.value(metadata i8 %rem, metadata !12, metadata !DIExpression()), !dbg !17
+ ret void, !dbg !18
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "reduced.ll", directory: "/")
+!5 = distinct !DISubprogram(name: "test8_neg_neg", linkageName: "test8_neg_neg", scope: null, file: !2, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9, !11, !12}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !2, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned)
+!11 = !DILocalVariable(name: "2", scope: !5, file: !2, line: 3, type: !10)
+!12 = !DILocalVariable(name: "3", scope: !5, file: !2, line: 5, type: !10)
+!13 = !DILocation(line: 1, column: 1, scope: !5)
+!14 = !DILocation(line: 2, column: 1, scope: !5)
+!15 = !DILocation(line: 3, column: 1, scope: !5)
+!16 = !DILocation(line: 4, column: 1, scope: !5)
+!17 = !DILocation(line: 5, column: 1, scope: !5)
+!18 = !DILocation(line: 6, column: 1, scope: !5)
\ No newline at end of file
diff --git a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll
index 7a968e4119b8..ea6392714bf6 100644
--- a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll
+++ b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll
@@ -219,7 +219,7 @@ entry:
}
define void @test_memcpy_argonly(ptr %dst, ptr %src) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
; FNATTRS-LABEL: define void @test_memcpy_argonly
; FNATTRS-SAME: (ptr nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]]) #[[ATTR9:[0-9]+]] {
; FNATTRS-NEXT: entry:
@@ -243,7 +243,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
@arr = global [32 x i8] zeroinitializer
define void @test_memcpy_src_global(ptr %dst) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(readwrite, inaccessiblemem: none)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none)
; FNATTRS-LABEL: define void @test_memcpy_src_global
; FNATTRS-SAME: (ptr nocapture writeonly [[DST:%.*]]) #[[ATTR11:[0-9]+]] {
; FNATTRS-NEXT: entry:
@@ -263,7 +263,7 @@ entry:
}
define void @test_memcpy_dst_global(ptr %src) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(readwrite, inaccessiblemem: none)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none)
; FNATTRS-LABEL: define void @test_memcpy_dst_global
; FNATTRS-SAME: (ptr nocapture readonly [[SRC:%.*]]) #[[ATTR11]] {
; FNATTRS-NEXT: entry:
diff --git a/llvm/test/Transforms/FunctionAttrs/convergent.ll b/llvm/test/Transforms/FunctionAttrs/convergent.ll
index 0263e0ec2255..a0f4c07e4337 100644
--- a/llvm/test/Transforms/FunctionAttrs/convergent.ll
+++ b/llvm/test/Transforms/FunctionAttrs/convergent.ll
@@ -74,7 +74,7 @@ declare void @llvm.nvvm.barrier0() convergent
define i32 @intrinsic() convergent {
; Implicitly convergent, because the intrinsic is convergent.
-; CHECK: Function Attrs: convergent nounwind
+; CHECK: Function Attrs: convergent norecurse nounwind
; CHECK-LABEL: define {{[^@]+}}@intrinsic
; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: call void @llvm.nvvm.barrier0()
diff --git a/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll b/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll
index 9ba82e2dc1cc..0f087e1a05f7 100644
--- a/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll
+++ b/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll
@@ -7,7 +7,7 @@ declare void @llvm.sideeffect()
; is present.
define void @test() {
-; CHECK: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: readwrite)
; CHECK-LABEL: @test(
; CHECK-NEXT: call void @llvm.sideeffect()
; CHECK-NEXT: ret void
@@ -17,7 +17,7 @@ define void @test() {
}
define void @loop() {
-; CHECK: Function Attrs: nofree noreturn nosync nounwind memory(inaccessiblemem: readwrite)
+; CHECK: Function Attrs: nofree norecurse noreturn nosync nounwind memory(inaccessiblemem: readwrite)
; CHECK-LABEL: @loop(
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
diff --git a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll
index 17072bc433fb..bb9ef9156794 100644
--- a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll
+++ b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll
@@ -6,7 +6,7 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7:8"
define amdgpu_kernel void @test_make_buffer_rsrc(ptr %p, ptr %q) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
; FNATTRS-LABEL: define {{[^@]+}}@test_make_buffer_rsrc
; FNATTRS-SAME: (ptr nocapture readonly [[P:%.*]], ptr nocapture writeonly [[Q:%.*]]) #[[ATTR0:[0-9]+]] {
; FNATTRS-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P]], i16 0, i32 4, i32 822243328)
diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
index a70d71e62c30..3d483f671b1a 100644
--- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
@@ -55,7 +55,7 @@ define void @c3(ptr %q) {
define i1 @c4(ptr %q, i32 %bitno) {
; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; FNATTRS-LABEL: define i1 @c4
+; FNATTRS-LABEL: define noundef i1 @c4
; FNATTRS-SAME: (ptr [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR0]] {
; FNATTRS-NEXT: [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
; FNATTRS-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
@@ -91,7 +91,7 @@ l1:
; c4b is c4 but without the escaping part
define i1 @c4b(ptr %q, i32 %bitno) {
; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; FNATTRS-LABEL: define i1 @c4b
+; FNATTRS-LABEL: define noundef i1 @c4b
; FNATTRS-SAME: (ptr [[Q:%.*]], i32 [[BITNO:%.*]]) #[[ATTR0]] {
; FNATTRS-NEXT: [[TMP:%.*]] = ptrtoint ptr [[Q]] to i32
; FNATTRS-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP]], [[BITNO]]
@@ -160,7 +160,7 @@ declare void @throw_if_bit_set(ptr, i8) readonly
define i1 @c6(ptr %q, i8 %bit) personality ptr @__gxx_personality_v0 {
; FNATTRS: Function Attrs: nofree memory(read)
-; FNATTRS-LABEL: define i1 @c6
+; FNATTRS-LABEL: define noundef i1 @c6
; FNATTRS-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR5:[0-9]+]] personality ptr @__gxx_personality_v0 {
; FNATTRS-NEXT: invoke void @throw_if_bit_set(ptr [[Q]], i8 [[BIT]])
; FNATTRS-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
@@ -650,7 +650,7 @@ entry:
}
define void @nocaptureLaunder(ptr %p) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write, inaccessiblemem: readwrite)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write, inaccessiblemem: readwrite)
; FNATTRS-LABEL: define void @nocaptureLaunder
; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR13:[0-9]+]] {
; FNATTRS-NEXT: entry:
@@ -674,7 +674,7 @@ entry:
@g2 = global ptr null
define void @captureLaunder(ptr %p) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite)
; FNATTRS-LABEL: define void @captureLaunder
; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR14:[0-9]+]] {
; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]])
@@ -694,7 +694,7 @@ define void @captureLaunder(ptr %p) {
}
define void @nocaptureStrip(ptr %p) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
; FNATTRS-LABEL: define void @nocaptureStrip
; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR15:[0-9]+]] {
; FNATTRS-NEXT: entry:
@@ -718,9 +718,9 @@ entry:
@g3 = global ptr null
define void @captureStrip(ptr %p) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
; FNATTRS-LABEL: define void @captureStrip
-; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR16:[0-9]+]] {
+; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR1]] {
; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]])
; FNATTRS-NEXT: store ptr [[B]], ptr @g3, align 8
; FNATTRS-NEXT: ret void
@@ -813,7 +813,7 @@ define i1 @nocaptureInboundsGEPICmpRev(ptr %x) {
define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) {
; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; FNATTRS-LABEL: define i1 @nocaptureDereferenceableOrNullICmp
+; FNATTRS-LABEL: define noundef i1 @nocaptureDereferenceableOrNullICmp
; FNATTRS-SAME: (ptr nocapture readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR0]] {
; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null
; FNATTRS-NEXT: ret i1 [[TMP1]]
@@ -830,8 +830,8 @@ define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x)
define i1 @captureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) null_pointer_is_valid {
; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none)
-; FNATTRS-LABEL: define i1 @captureDereferenceableOrNullICmp
-; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR17:[0-9]+]] {
+; FNATTRS-LABEL: define noundef i1 @captureDereferenceableOrNullICmp
+; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16:[0-9]+]] {
; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null
; FNATTRS-NEXT: ret i1 [[TMP1]]
;
@@ -886,8 +886,8 @@ define void @recurse_fptr(ptr %f, ptr %p) {
define void @readnone_indirec(ptr %f, ptr %p) {
; FNATTRS: Function Attrs: nofree nosync memory(none)
; FNATTRS-LABEL: define void @readnone_indirec
-; FNATTRS-SAME: (ptr nocapture readonly [[F:%.*]], ptr readnone [[P:%.*]]) #[[ATTR18:[0-9]+]] {
-; FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR21:[0-9]+]]
+; FNATTRS-SAME: (ptr nocapture readonly [[F:%.*]], ptr readnone [[P:%.*]]) #[[ATTR17:[0-9]+]] {
+; FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR20:[0-9]+]]
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR: Function Attrs: nosync memory(none)
diff --git a/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll b/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll
index 0fe0eadf5f66..ed5534a24cbe 100644
--- a/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll
@@ -225,9 +225,9 @@ define void @call_both() #0 {
declare float @llvm.floor.f32(float)
define void @call_floor(float %a) #0 {
-; FNATTR: Function Attrs: mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable
+; FNATTR: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
; FNATTR-LABEL: define {{[^@]+}}@call_floor
-; FNATTR-SAME: (float [[A:%.*]]) #[[ATTR7:[0-9]+]] {
+; FNATTR-SAME: (float [[A:%.*]]) #[[ATTR3]] {
; FNATTR-NEXT: [[TMP1:%.*]] = tail call float @llvm.floor.f32(float [[A]])
; FNATTR-NEXT: ret void
;
diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
index 7ca07e549346..d9bdb6298ed0 100644
--- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
@@ -32,14 +32,23 @@ define ptr @test2(ptr nonnull %p) {
; Given an SCC where one of the functions can not be marked nonnull,
; can we still mark the other one which is trivially nonnull
define ptr @scc_binder(i1 %c) {
-; COMMON-LABEL: define ptr @scc_binder(
-; COMMON-SAME: i1 [[C:%.*]]) {
-; COMMON-NEXT: br i1 [[C]], label [[REC:%.*]], label [[END:%.*]]
-; COMMON: rec:
-; COMMON-NEXT: [[TMP1:%.*]] = call ptr @test3(i1 [[C]])
-; COMMON-NEXT: br label [[END]]
-; COMMON: end:
-; COMMON-NEXT: ret ptr null
+; FNATTRS-LABEL: define noundef ptr @scc_binder(
+; FNATTRS-SAME: i1 [[C:%.*]]) {
+; FNATTRS-NEXT: br i1 [[C]], label [[REC:%.*]], label [[END:%.*]]
+; FNATTRS: rec:
+; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @test3(i1 [[C]])
+; FNATTRS-NEXT: br label [[END]]
+; FNATTRS: end:
+; FNATTRS-NEXT: ret ptr null
+;
+; ATTRIBUTOR-LABEL: define ptr @scc_binder(
+; ATTRIBUTOR-SAME: i1 [[C:%.*]]) {
+; ATTRIBUTOR-NEXT: br i1 [[C]], label [[REC:%.*]], label [[END:%.*]]
+; ATTRIBUTOR: rec:
+; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call ptr @test3(i1 [[C]])
+; ATTRIBUTOR-NEXT: br label [[END]]
+; ATTRIBUTOR: end:
+; ATTRIBUTOR-NEXT: ret ptr null
;
br i1 %c, label %rec, label %end
rec:
@@ -97,7 +106,7 @@ define ptr @test4() {
; Given a mutual recursive set of functions which *can* return null
; make sure we haven't marked them as nonnull.
define ptr @test5_helper(i1 %c) {
-; FNATTRS-LABEL: define noalias ptr @test5_helper(
+; FNATTRS-LABEL: define noalias noundef ptr @test5_helper(
; FNATTRS-SAME: i1 [[C:%.*]]) #[[ATTR1]] {
; FNATTRS-NEXT: br i1 [[C]], label [[REC:%.*]], label [[END:%.*]]
; FNATTRS: rec:
@@ -124,7 +133,7 @@ end:
}
define ptr @test5(i1 %c) {
-; FNATTRS-LABEL: define noalias ptr @test5(
+; FNATTRS-LABEL: define noalias noundef ptr @test5(
; FNATTRS-SAME: i1 [[C:%.*]]) #[[ATTR1]] {
; FNATTRS-NEXT: [[RET:%.*]] = call ptr @test5_helper(i1 [[C]])
; FNATTRS-NEXT: ret ptr [[RET]]
@@ -892,7 +901,7 @@ define i8 @parent7(ptr %a) {
declare i32 @esfp(...)
define i1 @parent8(ptr %a, ptr %bogus1, ptr %b) personality ptr @esfp{
-; FNATTRS-LABEL: define i1 @parent8(
+; FNATTRS-LABEL: define noundef i1 @parent8(
; FNATTRS-SAME: ptr nonnull [[A:%.*]], ptr nocapture readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR7]] personality ptr @esfp {
; FNATTRS-NEXT: entry:
; FNATTRS-NEXT: invoke void @use2nonnull(ptr [[A]], ptr [[B]])
@@ -981,7 +990,7 @@ define ptr addrspace(3) @gep2(ptr addrspace(3) %p) {
; FIXME: We should propagate dereferenceable here but *not* nonnull
define ptr addrspace(3) @as(ptr addrspace(3) dereferenceable(4) %p) {
-; FNATTRS-LABEL: define ptr addrspace(3) @as(
+; FNATTRS-LABEL: define noundef ptr addrspace(3) @as(
; FNATTRS-SAME: ptr addrspace(3) readnone returned dereferenceable(4) [[P:%.*]]) #[[ATTR0]] {
; FNATTRS-NEXT: ret ptr addrspace(3) [[P]]
;
@@ -993,7 +1002,7 @@ define ptr addrspace(3) @as(ptr addrspace(3) dereferenceable(4) %p) {
}
define internal ptr @g2() {
-; FNATTRS-LABEL: define internal nonnull ptr @g2(
+; FNATTRS-LABEL: define internal noundef nonnull ptr @g2(
; FNATTRS-SAME: ) #[[ATTR0]] {
; FNATTRS-NEXT: ret ptr inttoptr (i64 4 to ptr)
;
@@ -1005,7 +1014,7 @@ define internal ptr @g2() {
}
define ptr @g1() {
-; FNATTRS-LABEL: define nonnull ptr @g1(
+; FNATTRS-LABEL: define noundef nonnull ptr @g1(
; FNATTRS-SAME: ) #[[ATTR0]] {
; FNATTRS-NEXT: [[C:%.*]] = call ptr @g2()
; FNATTRS-NEXT: ret ptr [[C]]
diff --git a/llvm/test/Transforms/FunctionAttrs/norecurse.ll b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
index e1c624dc2ce5..7924428fb498 100644
--- a/llvm/test/Transforms/FunctionAttrs/norecurse.ll
+++ b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
@@ -73,7 +73,7 @@ define i32 @extern() {
; ATTRIBUTOR: Function Attrs: nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@extern
; ATTRIBUTOR-SAME: () #[[ATTR2:[0-9]+]] {
-; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k()
+; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7:[0-9]+]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
%a = call i32 @k()
@@ -83,7 +83,7 @@ define i32 @extern() {
declare i32 @k() readnone
define void @intrinsic(ptr %dest, ptr %src, i32 %len) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
; FNATTRS-LABEL: define {{[^@]+}}@intrinsic
; FNATTRS-SAME: (ptr nocapture writeonly [[DEST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR4:[0-9]+]] {
; FNATTRS-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[DEST]], ptr [[SRC]], i32 [[LEN]], i1 false)
@@ -92,7 +92,7 @@ define void @intrinsic(ptr %dest, ptr %src, i32 %len) {
; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@intrinsic
; ATTRIBUTOR-SAME: (ptr nocapture nofree writeonly [[DEST:%.*]], ptr nocapture nofree readonly [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR4:[0-9]+]] {
-; ATTRIBUTOR-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nocapture writeonly [[DEST]], ptr nocapture readonly [[SRC]], i32 [[LEN]], i1 false) #[[ATTR7:[0-9]+]]
+; ATTRIBUTOR-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nocapture writeonly [[DEST]], ptr nocapture readonly [[SRC]], i32 [[LEN]], i1 false) #[[ATTR8:[0-9]+]]
; ATTRIBUTOR-NEXT: ret void
;
call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 %len, i1 false)
@@ -111,7 +111,7 @@ define internal i32 @called_by_norecurse() {
; ATTRIBUTOR: Function Attrs: nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse
; ATTRIBUTOR-SAME: () #[[ATTR2]] {
-; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k()
+; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
%a = call i32 @k()
@@ -145,7 +145,7 @@ define internal i32 @called_by_norecurse_indirectly() {
; ATTRIBUTOR: Function Attrs: nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse_indirectly
; ATTRIBUTOR-SAME: () #[[ATTR2]] {
-; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k()
+; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
%a = call i32 @k()
@@ -196,7 +196,7 @@ define internal i32 @escapes_as_parameter(ptr %p) {
; ATTRIBUTOR: Function Attrs: nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@escapes_as_parameter
; ATTRIBUTOR-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR2]] {
-; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k()
+; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
%a = call i32 @k()
@@ -241,7 +241,7 @@ define void @r() norecurse {
; FNATTRS: attributes #[[ATTR1]] = { nofree nosync nounwind memory(none) }
; FNATTRS: attributes #[[ATTR2]] = { nofree nosync memory(none) }
; FNATTRS: attributes #[[ATTR3:[0-9]+]] = { memory(none) }
-; FNATTRS: attributes #[[ATTR4]] = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) }
+; FNATTRS: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
; FNATTRS: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; FNATTRS: attributes #[[ATTR6]] = { nofree norecurse nosync memory(none) }
;.
@@ -252,5 +252,6 @@ define void @r() norecurse {
; ATTRIBUTOR: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
; ATTRIBUTOR: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; ATTRIBUTOR: attributes #[[ATTR6]] = { norecurse nosync memory(none) }
-; ATTRIBUTOR: attributes #[[ATTR7]] = { nofree willreturn }
+; ATTRIBUTOR: attributes #[[ATTR7]] = { nosync }
+; ATTRIBUTOR: attributes #[[ATTR8]] = { nofree willreturn }
;.
diff --git a/llvm/test/Transforms/FunctionAttrs/nosync.ll b/llvm/test/Transforms/FunctionAttrs/nosync.ll
index 5950f9e626c4..de5398f17ce5 100644
--- a/llvm/test/Transforms/FunctionAttrs/nosync.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nosync.ll
@@ -236,7 +236,7 @@ declare void @llvm.memset(ptr %dest, i8 %val, i32 %len, i1 %isvolatile)
; negative, checking volatile intrinsics.
define i32 @memcpy_volatile(ptr %ptr1, ptr %ptr2) {
-; CHECK: Function Attrs: mustprogress nofree nounwind willreturn memory(argmem: readwrite)
+; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; CHECK-LABEL: @memcpy_volatile(
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], i32 8, i1 true)
; CHECK-NEXT: ret i32 4
@@ -247,7 +247,7 @@ define i32 @memcpy_volatile(ptr %ptr1, ptr %ptr2) {
; positive, non-volatile intrinsic.
define i32 @memset_non_volatile(ptr %ptr1, i8 %val) {
-; CHECK: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write)
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
; CHECK-LABEL: @memset_non_volatile(
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr [[PTR1:%.*]], i8 [[VAL:%.*]], i32 8, i1 false)
; CHECK-NEXT: ret i32 4
@@ -298,7 +298,7 @@ define void @i_totally_sync() {
declare float @llvm.cos(float %val) readnone
define float @cos_test(float %x) {
-; CHECK: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
+; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
; CHECK-LABEL: @cos_test(
; CHECK-NEXT: [[C:%.*]] = call float @llvm.cos.f32(float [[X:%.*]])
; CHECK-NEXT: ret float [[C]]
diff --git a/llvm/test/Transforms/FunctionAttrs/noundef.ll b/llvm/test/Transforms/FunctionAttrs/noundef.ll
new file mode 100644
index 000000000000..946b562f3955
--- /dev/null
+++ b/llvm/test/Transforms/FunctionAttrs/noundef.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes='function-attrs' -S | FileCheck %s
+
+define i32 @test_ret_constant() {
+; CHECK-LABEL: define noundef i32 @test_ret_constant(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret i32 0
+;
+ ret i32 0
+}
+
+define i32 @test_ret_poison() {
+; CHECK-LABEL: define i32 @test_ret_poison(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: ret i32 poison
+;
+ ret i32 poison
+}
+
+define i32 @test_ret_undef() {
+; CHECK-LABEL: define i32 @test_ret_undef(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: ret i32 undef
+;
+ ret i32 undef
+}
+
+define i32 @test_ret_param(i32 %x) {
+; CHECK-LABEL: define i32 @test_ret_param(
+; CHECK-SAME: i32 returned [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret i32 [[X]]
+;
+ ret i32 %x
+}
+
+define i32 @test_ret_noundef_param(i32 noundef %x) {
+; CHECK-LABEL: define noundef i32 @test_ret_noundef_param(
+; CHECK-SAME: i32 noundef returned [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret i32 [[X]]
+;
+ ret i32 %x
+}
+
+define i32 @test_ret_noundef_expr(i32 noundef %x) {
+; CHECK-LABEL: define noundef i32 @test_ret_noundef_expr(
+; CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[Y:%.*]] = add i32 [[X]], 1
+; CHECK-NEXT: ret i32 [[Y]]
+;
+ %y = add i32 %x, 1
+ ret i32 %y
+}
+
+define i32 @test_ret_create_poison_expr(i32 noundef %x) {
+; CHECK-LABEL: define i32 @test_ret_create_poison_expr(
+; CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[Y:%.*]] = add nsw i32 [[X]], 1
+; CHECK-NEXT: ret i32 [[Y]]
+;
+ %y = add nsw i32 %x, 1
+ ret i32 %y
+}
+
+define i32 @test_ret_freezed(i32 noundef %x) {
+; CHECK-LABEL: define noundef i32 @test_ret_freezed(
+; CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[Y:%.*]] = add nsw i32 [[X]], 1
+; CHECK-NEXT: [[Z:%.*]] = freeze i32 [[Y]]
+; CHECK-NEXT: ret i32 [[Z]]
+;
+ %y = add nsw i32 %x, 1
+ %z = freeze i32 %y
+ ret i32 %z
+}
+
+define i32 @test_ret_control_flow(i32 noundef %x) {
+; CHECK-LABEL: define noundef i32 @test_ret_control_flow(
+; CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i32 2
+; CHECK: if.else:
+; CHECK-NEXT: [[RET:%.*]] = add i32 [[X]], 1
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %cond = icmp eq i32 %x, 0
+ br i1 %cond, label %if.then, label %if.else
+if.then:
+ ret i32 2
+if.else:
+ %ret = add i32 %x, 1
+ ret i32 %ret
+}
+
+define i32 @test_ret_control_flow_may_poison(i32 noundef %x) {
+; CHECK-LABEL: define i32 @test_ret_control_flow_may_poison(
+; CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i32 2
+; CHECK: if.else:
+; CHECK-NEXT: [[RET:%.*]] = add nsw i32 [[X]], 1
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %cond = icmp eq i32 %x, 0
+ br i1 %cond, label %if.then, label %if.else
+if.then:
+ ret i32 2
+if.else:
+ %ret = add nsw i32 %x, 1
+ ret i32 %ret
+}
+
+; TODO: use context-sensitive analysis
+define i32 @test_ret_control_flow_never_poison(i32 noundef %x) {
+; CHECK-LABEL: define i32 @test_ret_control_flow_never_poison(
+; CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[X]], 2147483647
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret i32 2
+; CHECK: if.else:
+; CHECK-NEXT: [[RET:%.*]] = add nsw i32 [[X]], 1
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %cond = icmp eq i32 %x, 2147483647
+ br i1 %cond, label %if.then, label %if.else
+if.then:
+ ret i32 2
+if.else:
+ %ret = add nsw i32 %x, 1
+ ret i32 %ret
+}
+
+define i32 @test_noundef_prop() {
+; CHECK-LABEL: define noundef i32 @test_noundef_prop(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[RET:%.*]] = call i32 @test_ret_constant()
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %ret = call i32 @test_ret_constant()
+ ret i32 %ret
+}
+
+; Don't deduce noundef for functions with sanitize_memory.
+define i32 @test_ret_constant_msan() sanitize_memory {
+; CHECK-LABEL: define i32 @test_ret_constant_msan(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: ret i32 0
+;
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll b/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll
index 4b23583b8754..6209ddc8317e 100644
--- a/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll
+++ b/llvm/test/Transforms/FunctionAttrs/out-of-bounds-iterator-bug.ll
@@ -17,7 +17,7 @@ define void @va_func(ptr readonly %b, ...) readonly nounwind {
}
define i32 @caller(ptr %x) {
-; CHECK-LABEL: define i32 @caller(ptr nocapture readonly %x)
+; CHECK-LABEL: define noundef i32 @caller(ptr nocapture readonly %x)
entry:
call void(ptr,...) @va_func(ptr null, i32 0, i32 0, i32 0, ptr %x)
ret i32 42
@@ -34,7 +34,7 @@ define void @va_func2(ptr readonly %b, ...) {
}
define i32 @caller2(ptr %x, ptr %y) {
-; CHECK-LABEL: define i32 @caller2(ptr nocapture readonly %x, ptr %y)
+; CHECK-LABEL: define noundef i32 @caller2(ptr nocapture readonly %x, ptr %y)
entry:
call void(ptr,...) @va_func2(ptr %x, i32 0, i32 0, i32 0, ptr %y)
ret i32 42
diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll
index 0986f74c181d..39513976f90d 100644
--- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll
+++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll
@@ -251,7 +251,7 @@ entry:
declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>%val, <4 x ptr>, i32, <4 x i1>)
define void @test9(<4 x ptr> %ptrs, <4 x i32>%val) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write)
; FNATTRS-LABEL: define {{[^@]+}}@test9
; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR7:[0-9]+]] {
; FNATTRS-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
@@ -275,7 +275,7 @@ define void @test9(<4 x ptr> %ptrs, <4 x i32>%val) {
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
define <4 x i32> @test10(<4 x ptr> %ptrs) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read)
; FNATTRS-LABEL: define {{[^@]+}}@test10
; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] {
; FNATTRS-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef)
diff --git a/llvm/test/Transforms/FunctionAttrs/writeonly.ll b/llvm/test/Transforms/FunctionAttrs/writeonly.ll
index 5b20300610d8..de2d5e223894 100644
--- a/llvm/test/Transforms/FunctionAttrs/writeonly.ll
+++ b/llvm/test/Transforms/FunctionAttrs/writeonly.ll
@@ -179,9 +179,9 @@ define void @test_atomicrmw(ptr %p) {
}
define void @test_ptrmask(ptr %p) {
-; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write)
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
; FNATTRS-LABEL: define {{[^@]+}}@test_ptrmask
-; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR8:[0-9]+]] {
+; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR3]] {
; FNATTRS-NEXT: [[MASK:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[P]], i64 -5)
; FNATTRS-NEXT: store i8 0, ptr [[MASK]], align 1
; FNATTRS-NEXT: ret void
@@ -218,7 +218,7 @@ declare void @direct2_callee(ptr %p) writeonly
define void @direct2(ptr %p) {
; FNATTRS: Function Attrs: memory(write)
; FNATTRS-LABEL: define {{[^@]+}}@direct2
-; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR10:[0-9]+]] {
+; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR9:[0-9]+]] {
; FNATTRS-NEXT: call void @direct2_callee(ptr [[P]])
; FNATTRS-NEXT: ret void
;
@@ -236,7 +236,7 @@ define void @direct2(ptr %p) {
define void @direct2b(ptr %p) {
; FNATTRS: Function Attrs: memory(write)
; FNATTRS-LABEL: define {{[^@]+}}@direct2b
-; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR10]] {
+; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR9]] {
; FNATTRS-NEXT: call void @direct2_callee(ptr nocapture [[P]])
; FNATTRS-NEXT: ret void
;
@@ -325,8 +325,8 @@ define void @fptr_test2(ptr %p, ptr %f) {
define void @fptr_test3(ptr %p, ptr %f) {
; FNATTRS: Function Attrs: memory(write)
; FNATTRS-LABEL: define {{[^@]+}}@fptr_test3
-; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]], ptr nocapture readonly [[F:%.*]]) #[[ATTR10]] {
-; FNATTRS-NEXT: call void [[F]](ptr nocapture [[P]]) #[[ATTR10]]
+; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]], ptr nocapture readonly [[F:%.*]]) #[[ATTR9]] {
+; FNATTRS-NEXT: call void [[F]](ptr nocapture [[P]]) #[[ATTR9]]
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR: Function Attrs: memory(write)
@@ -342,7 +342,7 @@ define void @fptr_test3(ptr %p, ptr %f) {
define void @test_argmem_none_callee(ptr %p) {
; FNATTRS-LABEL: define {{[^@]+}}@test_argmem_none_callee
; FNATTRS-SAME: (ptr nocapture readnone [[P:%.*]]) {
-; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR11:[0-9]+]]
+; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR10:[0-9]+]]
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_argmem_none_callee
@@ -357,7 +357,7 @@ define void @test_argmem_none_callee(ptr %p) {
define void @test_argmem_read_callee(ptr %p) {
; FNATTRS-LABEL: define {{[^@]+}}@test_argmem_read_callee
; FNATTRS-SAME: (ptr nocapture readonly [[P:%.*]]) {
-; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR12:[0-9]+]]
+; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR11:[0-9]+]]
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_argmem_read_callee
@@ -372,7 +372,7 @@ define void @test_argmem_read_callee(ptr %p) {
define void @test_argmem_write_callee(ptr %p) {
; FNATTRS-LABEL: define {{[^@]+}}@test_argmem_write_callee
; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) {
-; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR13:[0-9]+]]
+; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR12:[0-9]+]]
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_argmem_write_callee
diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll
index 612e9452c637..739db26311f4 100644
--- a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll
+++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll
@@ -331,9 +331,7 @@ define void @zext_nneg_add_nsw(ptr %A, i32 %offset, i32 %M) {
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[TMP2]] to i64
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]])
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -658,9 +656,7 @@ define void @zext_nneg_mul_nsw(ptr %A, i32 %multiple, i32 %M) {
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[TMP2]] to i64
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]])
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
diff --git a/llvm/test/Transforms/Inline/devirtualize-3.ll b/llvm/test/Transforms/Inline/devirtualize-3.ll
index b4f80072391a..8ef20301a4e0 100644
--- a/llvm/test/Transforms/Inline/devirtualize-3.ll
+++ b/llvm/test/Transforms/Inline/devirtualize-3.ll
@@ -1,7 +1,7 @@
; RUN: opt -aa-pipeline=basic-aa -S -passes='default<O2>' < %s | FileCheck %s
; PR5009
-; CHECK: define i32 @main()
+; CHECK: define noundef i32 @main()
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @exit(i32 38)
diff --git a/llvm/test/Transforms/Inline/devirtualize-5.ll b/llvm/test/Transforms/Inline/devirtualize-5.ll
index dbfe445e898c..298113dbaec2 100644
--- a/llvm/test/Transforms/Inline/devirtualize-5.ll
+++ b/llvm/test/Transforms/Inline/devirtualize-5.ll
@@ -5,7 +5,7 @@ define i32 @i() alwaysinline {
ret i32 45
}
-; CHECK-LABEL: define i32 @main
+; CHECK-LABEL: define {{(noundef )?}}i32 @main
; CHECK-NEXT: ret i32 45
define i32 @main() {
diff --git a/llvm/test/Transforms/Inline/launder.invariant.group.ll b/llvm/test/Transforms/Inline/launder.invariant.group.ll
index 71df79651369..9c1ddf7b5666 100644
--- a/llvm/test/Transforms/Inline/launder.invariant.group.ll
+++ b/llvm/test/Transforms/Inline/launder.invariant.group.ll
@@ -7,7 +7,7 @@
; This test checks that the value returned from the launder is considered to
; alias its argument. Due to a bug in how capture tracking handled the launder,
; it was sometimes incorrectly treated as noalias.
-; CHECK-LABEL: define i32 @bar(ptr noalias
+; CHECK-LABEL: define {{(noundef )?}}i32 @bar(ptr noalias
define i32 @bar(ptr noalias) {
; CHECK-NOT: noalias
%2 = call ptr @llvm.launder.invariant.group.p0(ptr %0)
@@ -18,7 +18,7 @@ define i32 @bar(ptr noalias) {
ret i32 %5
}
-; CHECK-LABEL: define i32 @foo(ptr noalias
+; CHECK-LABEL: define {{(noundef )?}}i32 @foo(ptr noalias
define i32 @foo(ptr noalias) {
; CHECK-NOT: call i32 @bar(
; CHECK-NOT: !noalias
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
index 881a9b7ff129..91ecf2476025 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
@@ -3015,10 +3015,8 @@ define i32 @icmp_x_slt_0_and_icmp_y_sgt_neg1_i32_fail(i32 %x, i32 %y) {
define i32 @icmp_slt_0_xor_icmp_sge_neg2_i32_fail(i32 %x) {
; CHECK-LABEL: @icmp_slt_0_xor_icmp_sge_neg2_i32_fail(
-; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[X:%.*]], -3
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[X]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP1]], [[A]]
-; CHECK-NEXT: [[D:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[X:%.*]], -2
+; CHECK-NEXT: [[D:%.*]] = zext i1 [[TMP1]] to i32
; CHECK-NEXT: ret i32 [[D]]
;
%A = icmp sge i32 %x, -2
diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll
index 8bab3104fd8c..396a7aa340f6 100644
--- a/llvm/test/Transforms/InstCombine/array.ll
+++ b/llvm/test/Transforms/InstCombine/array.ll
@@ -72,3 +72,39 @@ entry:
store i32 %b, ptr %gep
ret void
}
+
+define void @test_zext_nneg(ptr %ptr, i32 %a, i32 %b) {
+; CHECK-LABEL: define void @test_zext_nneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP0]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 10
+; CHECK-NEXT: store i32 [[B]], ptr [[GEP]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %add = add nsw i32 %a, 10
+ %idx = zext nneg i32 %add to i64
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+ store i32 %b, ptr %gep
+ ret void
+}
+
+define void @test_zext_missing_nneg(ptr %ptr, i32 %a, i32 %b) {
+; CHECK-LABEL: define void @test_zext_missing_nneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[A]], 10
+; CHECK-NEXT: [[IDX:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IDX]]
+; CHECK-NEXT: store i32 [[B]], ptr [[GEP]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %add = add nsw i32 %a, 10
+ %idx = zext i32 %add to i64
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+ store i32 %b, ptr %gep
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/call-cast-attrs.ll b/llvm/test/Transforms/InstCombine/call-cast-attrs.ll
index 0b4ce1fa293f..bb122b0e2c4a 100644
--- a/llvm/test/Transforms/InstCombine/call-cast-attrs.ll
+++ b/llvm/test/Transforms/InstCombine/call-cast-attrs.ll
@@ -16,6 +16,11 @@ define void @d(i32 %x, ...) {
ret void
}
+define void @naked_func() naked {
+ tail call void asm sideeffect "mov r1, r0", ""()
+ unreachable
+}
+
define void @g(ptr %y) {
call i32 @b(i32 zeroext 0)
call void @c(ptr %y)
@@ -23,6 +28,7 @@ define void @g(ptr %y) {
call void @d(i32 0, ptr sret(i32) %y)
call void @d(i32 0, ptr nocapture %y)
call void @d(ptr nocapture noundef %y)
+ call void @naked_func(i32 1)
ret void
}
; CHECK-LABEL: define void @g(ptr %y)
@@ -34,3 +40,4 @@ define void @g(ptr %y) {
; CHECK32: %2 = ptrtoint ptr %y to i32
; CHECK32: call void (i32, ...) @d(i32 noundef %2)
; CHECK64: call void @d(ptr nocapture noundef %y)
+; CHECK: call void @naked_func(i32 1)
diff --git a/llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll b/llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll
new file mode 100644
index 000000000000..e8b0fb198bd1
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll
@@ -0,0 +1,645 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+declare void @dummy()
+
+declare i32 @llvm.smax.i32(i32 %a, i32 %b)
+declare i32 @llvm.smin.i32(i32 %a, i32 %b)
+declare i32 @llvm.umax.i32(i32 %a, i32 %b)
+declare i32 @llvm.umin.i32(i32 %a, i32 %b)
+declare float @llvm.maxnum.f32(float %a, float %b)
+declare float @llvm.minnum.f32(float %a, float %b)
+declare float @llvm.maximum.f32(float %a, float %b)
+declare float @llvm.minimum.f32(float %a, float %b)
+declare float @llvm.pow.f32(float %a, float %b)
+
+define i8 @fold_phi_mul(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_mul(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = mul i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%a, %then]
+ %ret = mul i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+define i8 @fold_phi_mul_three(i1 %c, i1 %d, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_mul_three(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN1:%.*]], label [[END:%.*]]
+; CHECK: then1:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br i1 [[D]], label [[THEN2:%.*]], label [[END]]
+; CHECK: then2:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = mul i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then1, label %end
+then1:
+ call void @dummy()
+ br i1 %d, label %then2, label %end
+then2:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then1], [%a, %then2]
+ %phi2 = phi i8 [%b, %entry], [%a, %then1], [%b, %then2]
+ %ret = mul i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+define i8 @fold_phi_mul_three_notopt(i1 %c, i1 %d, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_mul_three_notopt(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN1:%.*]], label [[END:%.*]]
+; CHECK: then1:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br i1 [[D]], label [[THEN2:%.*]], label [[END]]
+; CHECK: then2:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN1]] ], [ [[A]], [[THEN2]] ]
+; CHECK-NEXT: [[PHI2:%.*]] = phi i8 [ [[B]], [[ENTRY]] ], [ [[A]], [[THEN1]] ], [ [[A]], [[THEN2]] ]
+; CHECK-NEXT: [[RET:%.*]] = mul i8 [[PHI1]], [[PHI2]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then1, label %end
+then1:
+ call void @dummy()
+ br i1 %d, label %then2, label %end
+then2:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then1], [%a, %then2]
+ %phi2 = phi i8 [%b, %entry], [%a, %then1], [%a, %then2]
+ %ret = mul i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+define i8 @fold_phi_mul_nsw_nuw(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_mul_nsw_nuw(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = mul nuw nsw i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%a, %then]
+ %ret = mul nsw nuw i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+define <2 x i8> @fold_phi_mul_fix_vec(i1 %c, <2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: define <2 x i8> @fold_phi_mul_fix_vec(
+; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = mul <2 x i8> [[A]], [[B]]
+; CHECK-NEXT: ret <2 x i8> [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi <2 x i8> [%a, %entry], [%b, %then]
+ %phi2 = phi <2 x i8> [%b, %entry], [%a, %then]
+ %ret = mul <2 x i8> %phi1, %phi2
+ ret <2 x i8> %ret
+}
+
+define <vscale x 2 x i8> @fold_phi_mul_scale_vec(i1 %c, <vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
+; CHECK-LABEL: define <vscale x 2 x i8> @fold_phi_mul_scale_vec(
+; CHECK-SAME: i1 [[C:%.*]], <vscale x 2 x i8> [[A:%.*]], <vscale x 2 x i8> [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = mul <vscale x 2 x i8> [[A]], [[B]]
+; CHECK-NEXT: ret <vscale x 2 x i8> [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi <vscale x 2 x i8> [%a, %entry], [%b, %then]
+ %phi2 = phi <vscale x 2 x i8> [%b, %entry], [%a, %then]
+ %ret = mul <vscale x 2 x i8> %phi1, %phi2
+ ret <vscale x 2 x i8> %ret
+}
+
+define i8 @fold_phi_mul_commute(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_mul_commute(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = mul i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%a, %then], [%b, %entry]
+ %ret = mul i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+
+define i8 @fold_phi_mul_notopt(i1 %c, i8 %a, i8 %b, i8 %d) {
+; CHECK-LABEL: define i8 @fold_phi_mul_notopt(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN]] ]
+; CHECK-NEXT: [[PHI2:%.*]] = phi i8 [ [[B]], [[ENTRY]] ], [ [[D]], [[THEN]] ]
+; CHECK-NEXT: [[RET:%.*]] = mul i8 [[PHI1]], [[PHI2]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%d, %then]
+ %ret = mul i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+
+define i8 @fold_phi_sub(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_sub(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN]] ]
+; CHECK-NEXT: [[PHI2:%.*]] = phi i8 [ [[B]], [[ENTRY]] ], [ [[A]], [[THEN]] ]
+; CHECK-NEXT: [[RET:%.*]] = sub i8 [[PHI1]], [[PHI2]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%a, %then]
+ %ret = sub i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+
+define i8 @fold_phi_add(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_add(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = add i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%a, %then]
+ %ret = add i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+define i8 @fold_phi_and(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_and(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = and i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%a, %then]
+ %ret = and i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+define i8 @fold_phi_or(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_or(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = or i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%a, %then]
+ %ret = or i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+
+define i8 @fold_phi_xor(i1 %c, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @fold_phi_xor(
+; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = xor i8 [[A]], [[B]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i8 [%a, %entry], [%b, %then]
+ %phi2 = phi i8 [%b, %entry], [%a, %then]
+ %ret = xor i8 %phi1, %phi2
+ ret i8 %ret
+}
+
+
+define float @fold_phi_fadd(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_fadd(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = fadd float [[A]], [[B]]
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = fadd float %phi1, %phi2
+ ret float %ret
+}
+
+define float @fold_phi_fadd_nnan(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_fadd_nnan(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = fadd nnan float [[A]], [[B]]
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = fadd nnan float %phi1, %phi2
+ ret float %ret
+}
+
+
+define float @fold_phi_fmul(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_fmul(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = fmul float [[A]], [[B]]
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = fmul float %phi1, %phi2
+ ret float %ret
+}
+
+
+define i32 @fold_phi_smax(i1 %c, i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @fold_phi_smax(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.smax.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i32 [%a, %entry], [%b, %then]
+ %phi2 = phi i32 [%b, %entry], [%a, %then]
+ %ret = call i32 @llvm.smax.i32(i32 %phi1, i32 %phi2)
+ ret i32 %ret
+}
+
+
+define i32 @fold_phi_smin(i1 %c, i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @fold_phi_smin(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.smin.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i32 [%a, %entry], [%b, %then]
+ %phi2 = phi i32 [%b, %entry], [%a, %then]
+ %ret = call i32 @llvm.smin.i32(i32 %phi1, i32 %phi2)
+ ret i32 %ret
+}
+
+
+define i32 @fold_phi_umax(i1 %c, i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @fold_phi_umax(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i32 [%a, %entry], [%b, %then]
+ %phi2 = phi i32 [%b, %entry], [%a, %then]
+ %ret = call i32 @llvm.umax.i32(i32 %phi1, i32 %phi2)
+ ret i32 %ret
+}
+
+define i32 @fold_phi_umin(i1 %c, i32 %a, i32 %b) {
+; CHECK-LABEL: define i32 @fold_phi_umin(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.umin.i32(i32 [[A]], i32 [[B]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi i32 [%a, %entry], [%b, %then]
+ %phi2 = phi i32 [%b, %entry], [%a, %then]
+ %ret = call i32 @llvm.umin.i32(i32 %phi1, i32 %phi2)
+ ret i32 %ret
+}
+
+
+define float @fold_phi_maxnum(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_maxnum(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.maxnum.f32(float [[A]], float [[B]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = call float @llvm.maxnum.f32(float %phi1, float %phi2)
+ ret float %ret
+}
+
+define float @fold_phi_pow(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_pow(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI1:%.*]] = phi float [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN]] ]
+; CHECK-NEXT: [[PHI2:%.*]] = phi float [ [[B]], [[ENTRY]] ], [ [[A]], [[THEN]] ]
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.pow.f32(float [[PHI1]], float [[PHI2]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = call float @llvm.pow.f32(float %phi1, float %phi2)
+ ret float %ret
+}
+
+define float @fold_phi_minnum(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_minnum(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.minnum.f32(float [[A]], float [[B]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = call float @llvm.minnum.f32(float %phi1, float %phi2)
+ ret float %ret
+}
+
+define float @fold_phi_maximum(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_maximum(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[B]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = call float @llvm.maximum.f32(float %phi1, float %phi2)
+ ret float %ret
+}
+
+define float @fold_phi_minimum(i1 %c, float %a, float %b) {
+; CHECK-LABEL: define float @fold_phi_minimum(
+; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: call void @dummy()
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[B]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ br i1 %c, label %then, label %end
+then:
+ call void @dummy()
+ br label %end
+end:
+ %phi1 = phi float [%a, %entry], [%b, %then]
+ %phi2 = phi float [%b, %entry], [%a, %then]
+ %ret = call float @llvm.minimum.f32(float %phi1, float %phi2)
+ ret float %ret
+}
+
diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll
index 5e5e65164f70..be9bedbf7985 100644
--- a/llvm/test/Transforms/InstCombine/free-inversion.ll
+++ b/llvm/test/Transforms/InstCombine/free-inversion.ll
@@ -133,6 +133,24 @@ define i8 @sub_2(i8 %a, i1 %c, i8 %x, i8 %y) {
ret i8 %not_ab
}
+; Same as above but with a type larger than i64 to make sure we create -2
+; correctly.
+define i128 @sub_3(i128 %a, i1 %c, i128 %x, i128 %y) {
+; CHECK-LABEL: @sub_3(
+; CHECK-NEXT: [[TMP1:%.*]] = xor i128 [[Y:%.*]], -124
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[C:%.*]], i128 [[X:%.*]], i128 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i128 [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[NOT_AB:%.*]] = sub i128 -2, [[TMP3]]
+; CHECK-NEXT: ret i128 [[NOT_AB]]
+;
+ %nx = xor i128 %x, -1
+ %yy = xor i128 %y, 123
+ %b = select i1 %c, i128 %nx, i128 %yy
+ %ab = sub i128 %a, %b
+ %not_ab = xor i128 %ab, -1
+ ret i128 %not_ab
+}
+
define i8 @sub_fail(i8 %a, i1 %c, i8 %x, i8 %y) {
; CHECK-LABEL: @sub_fail(
; CHECK-NEXT: [[NX:%.*]] = xor i8 [[X:%.*]], -1
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index bc7fdc9352df..373b7f5f2fc0 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -1453,4 +1453,166 @@ define ptr @const_gep_chain(ptr %p, i64 %a) {
ret ptr %p4
}
+define ptr @gep_sdiv(ptr %p, i64 %off) {
+; CHECK-LABEL: @gep_sdiv(
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFF:%.*]]
+; CHECK-NEXT: ret ptr [[PTR]]
+;
+ %index = sdiv exact i64 %off, 7
+ %ptr = getelementptr %struct.C, ptr %p, i64 %index
+ ret ptr %ptr
+}
+
+define <2 x ptr> @gep_sdiv_vec(<2 x ptr> %p, <2 x i64> %off) {
+; CHECK-LABEL: @gep_sdiv_vec(
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, <2 x ptr> [[P:%.*]], <2 x i64> [[OFF:%.*]]
+; CHECK-NEXT: ret <2 x ptr> [[PTR]]
+;
+ %index = sdiv exact <2 x i64> %off, <i64 7, i64 7>
+ %ptr = getelementptr %struct.C, <2 x ptr> %p, <2 x i64> %index
+ ret <2 x ptr> %ptr
+}
+
+define ptr @gep_sdiv_inbounds(ptr %p, i64 %off) {
+; CHECK-LABEL: @gep_sdiv_inbounds(
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 [[OFF:%.*]]
+; CHECK-NEXT: ret ptr [[PTR]]
+;
+ %index = sdiv exact i64 %off, 7
+ %ptr = getelementptr inbounds %struct.C, ptr %p, i64 %index
+ ret ptr %ptr
+}
+
+define ptr @gep_ashr(ptr %p, i64 %off) {
+; CHECK-LABEL: @gep_ashr(
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFF:%.*]]
+; CHECK-NEXT: ret ptr [[PTR]]
+;
+ %index = ashr exact i64 %off, 2
+ %ptr = getelementptr i32, ptr %p, i64 %index
+ ret ptr %ptr
+}
+
+; Negative tests
+
+define ptr @gep_i8(ptr %p, i64 %off) {
+; CHECK-LABEL: @gep_i8(
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[OFF:%.*]]
+; CHECK-NEXT: ret ptr [[PTR]]
+;
+ %ptr = getelementptr i8, ptr %p, i64 %off
+ ret ptr %ptr
+}
+
+define ptr @gep_sdiv_mismatched_size(ptr %p, i64 %off) {
+; CHECK-LABEL: @gep_sdiv_mismatched_size(
+; CHECK-NEXT: [[INDEX:%.*]] = sdiv exact i64 [[OFF:%.*]], 20
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: ret ptr [[PTR]]
+;
+ %index = sdiv exact i64 %off, 20
+ %ptr = getelementptr %struct.C, ptr %p, i64 %index
+ ret ptr %ptr
+}
+
+define ptr @gep_sdiv_without_exact(ptr %p, i64 %off) {
+; CHECK-LABEL: @gep_sdiv_without_exact(
+; CHECK-NEXT: [[INDEX:%.*]] = sdiv i64 [[OFF:%.*]], 7
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[P:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: ret ptr [[PTR]]
+;
+ %index = sdiv i64 %off, 7
+ %ptr = getelementptr %struct.C, ptr %p, i64 %index
+ ret ptr %ptr
+}
+
+define ptr @gep_ashr_without_exact(ptr %p, i64 %off) {
+; CHECK-LABEL: @gep_ashr_without_exact(
+; CHECK-NEXT: [[INDEX:%.*]] = ashr i64 [[OFF:%.*]], 2
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: ret ptr [[PTR]]
+;
+ %index = ashr i64 %off, 2
+ %ptr = getelementptr i32, ptr %p, i64 %index
+ ret ptr %ptr
+}
+
+define i1 @test_only_used_by_icmp(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: @test_only_used_by_icmp(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %pa = ptrtoint ptr %a to i64
+ %pb = ptrtoint ptr %b to i64
+ %sub = sub i64 %pb, %pa
+ %gep = getelementptr i8, ptr %a, i64 %sub
+ %cmp = icmp eq ptr %gep, %c
+ ret i1 %cmp
+}
+
+define i64 @test_only_used_by_ptrtoint(ptr %a, ptr %b) {
+; CHECK-LABEL: @test_only_used_by_ptrtoint(
+; CHECK-NEXT: [[VAL:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; CHECK-NEXT: ret i64 [[VAL]]
+;
+ %pa = ptrtoint ptr %a to i64
+ %pb = ptrtoint ptr %b to i64
+ %sub = sub i64 %pb, %pa
+ %gep = getelementptr i8, ptr %a, i64 %sub
+ %val = ptrtoint ptr %gep to i64
+ ret i64 %val
+}
+
+define i64 @test_used_by_both(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: @test_used_by_both(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[VAL:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: ret i64 [[VAL]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i64 0
+;
+ %pa = ptrtoint ptr %a to i64
+ %pb = ptrtoint ptr %b to i64
+ %sub = sub i64 %pb, %pa
+ %gep = getelementptr i8, ptr %a, i64 %sub
+ %cmp = icmp eq ptr %gep, %c
+ br i1 %cmp, label %if.then, label %if.else
+if.then:
+ %val = ptrtoint ptr %gep to i64
+ ret i64 %val
+if.else:
+ ret i64 0
+}
+
+; Negative tests
+
+define i64 @test_used_by_both_invalid(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: @test_used_by_both_invalid(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[PB:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[PA:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[PB]], [[PA]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[SUB]]
+; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT: ret i64 [[VAL]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i64 0
+;
+ %pa = ptrtoint ptr %a to i64
+ %pb = ptrtoint ptr %b to i64
+ %sub = sub i64 %pb, %pa
+ %gep = getelementptr i8, ptr %a, i64 %sub
+ %cmp = icmp eq ptr %gep, %c
+ br i1 %cmp, label %if.then, label %if.else
+if.then:
+ %val = load i64, ptr %gep, align 8
+ ret i64 %val
+if.else:
+ ret i64 0
+}
+
!0 = !{!"branch_weights", i32 2, i32 10}
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll
index 99c784d15eb3..fb297e04ceb0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-gep.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll
@@ -5,6 +5,7 @@ target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32:32-p3:64:64:64-i1:8:8-i8:
declare ptr @getptr()
declare void @use(ptr)
+declare void @use.i1(i1)
define i1 @eq_base(ptr %x, i64 %y) {
; CHECK-LABEL: @eq_base(
@@ -443,6 +444,29 @@ define i1 @test60_extra_use_const_operands_no_inbounds(ptr %foo, i64 %i, i64 %j)
ret i1 %cmp
}
+define void @test60_extra_use_fold(ptr %foo, i64 %start.idx, i64 %end.offset) {
+; CHECK-LABEL: @test60_extra_use_fold(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[START_IDX:%.*]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 [[END_OFFSET:%.*]]
+; CHECK-NEXT: call void @use(ptr [[GEP1]])
+; CHECK-NEXT: call void @use(ptr [[GEP2]])
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq ptr [[GEP1]], [[GEP2]]
+; CHECK-NEXT: call void @use.i1(i1 [[CMP1]])
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]]
+; CHECK-NEXT: call void @use.i1(i1 [[CMP2]])
+; CHECK-NEXT: ret void
+;
+ %gep1 = getelementptr inbounds i32, ptr %foo, i64 %start.idx
+ %gep2 = getelementptr inbounds i8, ptr %foo, i64 %end.offset
+ call void @use(ptr %gep1)
+ call void @use(ptr %gep2)
+ %cmp1 = icmp eq ptr %gep1, %gep2
+ call void @use.i1(i1 %cmp1)
+ %cmp2 = icmp ult ptr %gep1, %gep2
+ call void @use.i1(i1 %cmp2)
+ ret void
+}
+
define i1 @test_scalable_same(ptr %x) {
; CHECK-LABEL: @test_scalable_same(
; CHECK-NEXT: ret i1 false
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 1c7bb36f0d34..1f554c7b6025 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -854,6 +854,107 @@ define i1 @PR32949(i32 %X, i32 %Y, i32 %Z) {
ret i1 %C
}
+define i1 @test_sdiv_pos_slt(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_slt(
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp slt i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_pos_sle(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_sle(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp sle i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_pos_sgt(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_sgt(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp sgt i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_pos_sge(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_sge(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp sge i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_pos_ult(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_ult(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp ult i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_pos_ule(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_ule(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ule i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp ule i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_pos_ugt(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_ugt(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp ugt i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_pos_uge(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_pos_uge(
+; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, 40
+ %divy = sdiv exact i32 %y, 40
+ %cmp = icmp uge i32 %divx, %divy
+ ret i1 %cmp
+}
+
+define i1 @test_sdiv_neg_slt(i32 %x, i32 %y) {
+; CHECK-LABEL: @test_sdiv_neg_slt(
+; CHECK-NEXT: [[DIVX:%.*]] = sdiv exact i32 [[X:%.*]], -40
+; CHECK-NEXT: [[DIVY:%.*]] = sdiv exact i32 [[Y:%.*]], -40
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[DIVX]], [[DIVY]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %divx = sdiv exact i32 %x, -40
+ %divy = sdiv exact i32 %y, -40
+ %cmp = icmp slt i32 %divx, %divy
+ ret i1 %cmp
+}
+
; PR8469
define <2 x i1> @test49(<2 x i32> %i3) {
; CHECK-LABEL: @test49(
@@ -3862,10 +3963,9 @@ define <8 x i1> @bitreverse_vec_ne(<8 x i16> %x, <8 x i16> %y) {
define i1 @knownbits1(i8 %a, i8 %b) {
; CHECK-LABEL: @knownbits1(
; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], 1
-; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4
; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2
-; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5
-; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A2]], [[B2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A1]], [[TMP1]]
; CHECK-NEXT: ret i1 [[C]]
;
%a1 = and i8 %a, 5
@@ -3879,10 +3979,9 @@ define i1 @knownbits1(i8 %a, i8 %b) {
define i1 @knownbits2(i8 %a, i8 %b) {
; CHECK-LABEL: @knownbits2(
; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], 1
-; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4
; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2
-; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5
-; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A2]], [[B2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A1]], [[TMP1]]
; CHECK-NEXT: ret i1 [[C]]
;
%a1 = and i8 %a, 5
@@ -3896,10 +3995,9 @@ define i1 @knownbits2(i8 %a, i8 %b) {
define i1 @knownbits3(i8 %a, i8 %b) {
; CHECK-LABEL: @knownbits3(
; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], 1
-; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4
; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2
-; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5
-; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[B2]], [[A2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], [[A1]]
; CHECK-NEXT: ret i1 [[C]]
;
%a1 = and i8 %a, 5
@@ -3913,10 +4011,9 @@ define i1 @knownbits3(i8 %a, i8 %b) {
define <2 x i1> @knownbits4(<2 x i8> %a, <2 x i8> %b) {
; CHECK-LABEL: @knownbits4(
; CHECK-NEXT: [[A1:%.*]] = and <2 x i8> [[A:%.*]], <i8 1, i8 1>
-; CHECK-NEXT: [[A2:%.*]] = or disjoint <2 x i8> [[A1]], <i8 4, i8 4>
; CHECK-NEXT: [[B1:%.*]] = and <2 x i8> [[B:%.*]], <i8 2, i8 2>
-; CHECK-NEXT: [[B2:%.*]] = or disjoint <2 x i8> [[B1]], <i8 5, i8 5>
-; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i8> [[B2]], [[A2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint <2 x i8> [[B1]], <i8 1, i8 1>
+; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i8> [[TMP1]], [[A1]]
; CHECK-NEXT: ret <2 x i1> [[C]]
;
%a1 = and <2 x i8> %a, <i8 5, i8 5>
@@ -3932,10 +4029,9 @@ define <2 x i1> @knownbits4(<2 x i8> %a, <2 x i8> %b) {
define i1 @knownbits5(i8 %a, i8 %b) {
; CHECK-LABEL: @knownbits5(
; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], -127
-; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4
; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2
-; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5
-; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A2]], [[B2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A1]], [[TMP1]]
; CHECK-NEXT: ret i1 [[C]]
;
%a1 = and i8 %a, 133
@@ -3949,10 +4045,9 @@ define i1 @knownbits5(i8 %a, i8 %b) {
define i1 @knownbits6(i8 %a, i8 %b) {
; CHECK-LABEL: @knownbits6(
; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], -127
-; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4
; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2
-; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5
-; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A2]], [[B2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A1]], [[TMP1]]
; CHECK-NEXT: ret i1 [[C]]
;
%a1 = and i8 %a, 133
@@ -3966,10 +4061,9 @@ define i1 @knownbits6(i8 %a, i8 %b) {
define <2 x i1> @knownbits7(<2 x i8> %a, <2 x i8> %b) {
; CHECK-LABEL: @knownbits7(
; CHECK-NEXT: [[A1:%.*]] = and <2 x i8> [[A:%.*]], <i8 -127, i8 -127>
-; CHECK-NEXT: [[A2:%.*]] = or disjoint <2 x i8> [[A1]], <i8 4, i8 4>
; CHECK-NEXT: [[B1:%.*]] = and <2 x i8> [[B:%.*]], <i8 2, i8 2>
-; CHECK-NEXT: [[B2:%.*]] = or disjoint <2 x i8> [[B1]], <i8 5, i8 5>
-; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i8> [[B2]], [[A2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint <2 x i8> [[B1]], <i8 1, i8 1>
+; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i8> [[TMP1]], [[A1]]
; CHECK-NEXT: ret <2 x i1> [[C]]
;
%a1 = and <2 x i8> %a, <i8 133, i8 133>
@@ -3983,10 +4077,9 @@ define <2 x i1> @knownbits7(<2 x i8> %a, <2 x i8> %b) {
define i1 @knownbits8(i8 %a, i8 %b) {
; CHECK-LABEL: @knownbits8(
; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], -127
-; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4
; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2
-; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5
-; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[B2]], [[A2]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1
+; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[TMP1]], [[A1]]
; CHECK-NEXT: ret i1 [[C]]
;
%a1 = and i8 %a, 133
@@ -4912,3 +5005,136 @@ define i1 @or_positive_sgt_zero_multi_use(i8 %a) {
%cmp = icmp sgt i8 %b, 0
ret i1 %cmp
}
+
+
+define i1 @disjoint_or_sgt_1(i8 %a, i8 %b) {
+; CHECK-LABEL: @disjoint_or_sgt_1(
+; CHECK-NEXT: [[B1:%.*]] = add nsw i8 [[B:%.*]], 2
+; CHECK-NEXT: [[ICMP_:%.*]] = icmp sle i8 [[B1]], [[A:%.*]]
+; CHECK-NEXT: ret i1 [[ICMP_]]
+;
+ %a1 = or disjoint i8 %a, 1
+ %b1 = add nsw i8 %b, 2
+ %icmp_ = icmp sgt i8 %a1, %b1
+ ret i1 %icmp_
+}
+
+define i1 @disjoint_or_sgt_2(i8 %a, i8 %b) {
+; CHECK-LABEL: @disjoint_or_sgt_2(
+; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2
+; CHECK-NEXT: [[B1:%.*]] = add i8 [[B:%.*]], 1
+; CHECK-NEXT: [[ICMP_:%.*]] = icmp sgt i8 [[A1]], [[B1]]
+; CHECK-NEXT: ret i1 [[ICMP_]]
+;
+ %a1 = or disjoint i8 %a, 2
+ %b1 = add i8 %b, 1
+ %icmp_ = icmp sgt i8 %a1, %b1
+ ret i1 %icmp_
+}
+
+define i1 @disjoint_or_sgt_3(i8 %a, i8 %b) {
+; CHECK-LABEL: @disjoint_or_sgt_3(
+; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2
+; CHECK-NEXT: [[B1:%.*]] = add nuw i8 [[B:%.*]], 1
+; CHECK-NEXT: [[ICMP_:%.*]] = icmp sgt i8 [[A1]], [[B1]]
+; CHECK-NEXT: ret i1 [[ICMP_]]
+;
+ %a1 = or disjoint i8 %a, 2
+ %b1 = add nuw i8 %b, 1
+ %icmp_ = icmp sgt i8 %a1, %b1
+ ret i1 %icmp_
+}
+
+define i1 @disjoint_or_ugt_1(i8 %a, i8 %b) {
+; CHECK-LABEL: @disjoint_or_ugt_1(
+; CHECK-NEXT: [[B1:%.*]] = add nsw i8 [[B:%.*]], 2
+; CHECK-NEXT: [[ICMP_:%.*]] = icmp ule i8 [[B1]], [[A:%.*]]
+; CHECK-NEXT: ret i1 [[ICMP_]]
+;
+ %a1 = or disjoint i8 %a, 1
+ %b1 = add nsw i8 %b, 2
+ %icmp_ = icmp ugt i8 %a1, %b1
+ ret i1 %icmp_
+}
+
+define i1 @disjoint_or_ugt_2(i8 %a, i8 %b) {
+; CHECK-LABEL: @disjoint_or_ugt_2(
+; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2
+; CHECK-NEXT: [[B1:%.*]] = add i8 [[B:%.*]], 1
+; CHECK-NEXT: [[ICMP_:%.*]] = icmp ugt i8 [[A1]], [[B1]]
+; CHECK-NEXT: ret i1 [[ICMP_]]
+;
+ %a1 = or disjoint i8 %a, 2
+ %b1 = add i8 %b, 1
+ %icmp_ = icmp ugt i8 %a1, %b1
+ ret i1 %icmp_
+}
+
+define i1 @disjoint_or_ugt_3(i8 %a, i8 %b) {
+; CHECK-LABEL: @disjoint_or_ugt_3(
+; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2
+; CHECK-NEXT: [[B1:%.*]] = add nuw i8 [[B:%.*]], 1
+; CHECK-NEXT: [[ICMP_:%.*]] = icmp ugt i8 [[A1]], [[B1]]
+; CHECK-NEXT: ret i1 [[ICMP_]]
+;
+ %a1 = or disjoint i8 %a, 2
+ %b1 = add nuw i8 %b, 1
+ %icmp_ = icmp ugt i8 %a1, %b1
+ ret i1 %icmp_
+}
+
+define i1 @deduce_nuw_flag_1(i8 %a, i8 %b) {
+; CHECK-LABEL: @deduce_nuw_flag_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add nuw i8 [[B:%.*]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %add1 = add nuw i8 %b, 2
+ %add2 = add i8 %a, 1
+ %cmp = icmp eq i8 %add1, %add2
+ ret i1 %cmp
+}
+
+define i1 @deduce_nuw_flag_2(i8 %a, i8 %b) {
+; CHECK-LABEL: @deduce_nuw_flag_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add nuw i8 [[B:%.*]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %add1 = add nuw i8 %b, 2
+ %add2 = add i8 %a, 1
+ %cmp = icmp eq i8 %add2, %add1
+ ret i1 %cmp
+}
+
+define i1 @dont_deduce_nuw_flag_1(i8 %a, i8 %b) {
+; CHECK-LABEL: @dont_deduce_nuw_flag_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[B:%.*]], -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %add1 = add nuw i8 %b, -2
+ %add2 = add i8 %a, -1
+ %cmp = icmp eq i8 %add1, %add2
+ ret i1 %cmp
+}
+
+define i1 @dont_deduce_nuw_flag_2(i8 %a, i8 %b) {
+; CHECK-LABEL: @dont_deduce_nuw_flag_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[B:%.*]], -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+entry:
+ %add1 = add nuw i8 %b, -2
+ %add2 = add i8 %a, -1
+ %cmp = icmp eq i8 %add2, %add1
+ ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
index f3833a420ee8..ae2e115b1dd9 100644
--- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
@@ -2489,3 +2489,95 @@ define i1 @PR57986() {
%umin = call i1 @llvm.umin.i1(i1 ptrtoint (ptr @g to i1), i1 true)
ret i1 %umin
}
+
+define i8 @fold_umax_with_knownbits_info(i8 %a, i8 %b) {
+; CHECK-LABEL: @fold_umax_with_knownbits_info(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = or i8 [[A:%.*]], 1
+; CHECK-NEXT: [[A2:%.*]] = shl i8 [[B:%.*]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[A1]], [[A2]]
+; CHECK-NEXT: ret i8 [[SUB]]
+;
+entry:
+ %a1 = or i8 %a, 1
+ %a2 = shl i8 %b, 1
+ %sub = sub i8 %a1, %a2
+ %val = call i8 @llvm.umax.i8(i8 %sub, i8 1)
+ ret i8 %val
+}
+
+define <3 x i8> @fold_umax_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) {
+; CHECK-LABEL: @fold_umax_with_knownbits_info_undef_in_splat(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = or <3 x i8> [[A:%.*]], <i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[A2:%.*]] = shl <3 x i8> [[B:%.*]], <i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[SUB:%.*]] = sub <3 x i8> [[A1]], [[A2]]
+; CHECK-NEXT: ret <3 x i8> [[SUB]]
+;
+entry:
+ %a1 = or <3 x i8> %a, <i8 1, i8 1, i8 1>
+ %a2 = shl <3 x i8> %b, <i8 1, i8 1, i8 1>
+ %sub = sub <3 x i8> %a1, %a2
+ %val = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %sub, <3 x i8> <i8 1, i8 undef, i8 1>)
+ ret <3 x i8> %val
+}
+
+define i8 @fold_umin_with_knownbits_info(i8 %a, i8 %b) {
+; CHECK-LABEL: @fold_umin_with_knownbits_info(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i8 3
+;
+entry:
+ %a1 = or i8 %a, 3
+ %a2 = shl i8 %b, 2
+ %sub = sub i8 %a1, %a2
+ %val = call i8 @llvm.umin.i8(i8 %sub, i8 3)
+ ret i8 %val
+}
+
+define <3 x i8> @fold_umin_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) {
+; CHECK-LABEL: @fold_umin_with_knownbits_info_undef_in_splat(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <3 x i8> <i8 3, i8 3, i8 3>
+;
+entry:
+ %a1 = or <3 x i8> %a, <i8 3, i8 3, i8 3>
+ %a2 = shl <3 x i8> %b, <i8 2, i8 2, i8 2>
+ %sub = sub <3 x i8> %a1, %a2
+ %val = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %sub, <3 x i8> <i8 3, i8 undef, i8 3>)
+ ret <3 x i8> %val
+}
+
+define i8 @fold_umax_with_knownbits_info_fail(i8 %a, i8 %b) {
+; CHECK-LABEL: @fold_umax_with_knownbits_info_fail(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = or i8 [[A:%.*]], 2
+; CHECK-NEXT: [[A2:%.*]] = shl i8 [[B:%.*]], 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[A1]], [[A2]]
+; CHECK-NEXT: [[VAL:%.*]] = call i8 @llvm.umax.i8(i8 [[SUB]], i8 1)
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+entry:
+ %a1 = or i8 %a, 2
+ %a2 = shl i8 %b, 1
+ %sub = sub i8 %a1, %a2
+ %val = call i8 @llvm.umax.i8(i8 %sub, i8 1)
+ ret i8 %val
+}
+
+define i8 @fold_umin_with_knownbits_info_fail(i8 %a, i8 %b) {
+; CHECK-LABEL: @fold_umin_with_knownbits_info_fail(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = or i8 [[A:%.*]], 1
+; CHECK-NEXT: [[A2:%.*]] = shl i8 [[B:%.*]], 2
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[A1]], [[A2]]
+; CHECK-NEXT: [[VAL:%.*]] = call i8 @llvm.umin.i8(i8 [[SUB]], i8 3)
+; CHECK-NEXT: ret i8 [[VAL]]
+;
+entry:
+ %a1 = or i8 %a, 1
+ %a2 = shl i8 %b, 2
+ %sub = sub i8 %a1, %a2
+ %val = call i8 @llvm.umin.i8(i8 %sub, i8 3)
+ ret i8 %val
+}
diff --git a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll
index 448558d755a5..8fe4261bbf00 100644
--- a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll
@@ -672,9 +672,8 @@ define <2 x i32> @test_mul_canonicalize_vec(<2 x i32> %x, <2 x i32> %y) {
define i32 @test_mul_canonicalize_multiple_uses(i32 %x, i32 %y) {
; CHECK-LABEL: @test_mul_canonicalize_multiple_uses(
-; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[NEG]], [[Y:%.*]]
-; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL]], [[NEG]]
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]]
; CHECK-NEXT: ret i32 [[MUL2]]
;
%neg = sub i32 0, %x
diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll
index 9fe8462c5d31..b404fcffbf42 100644
--- a/llvm/test/Transforms/InstCombine/mul.ll
+++ b/llvm/test/Transforms/InstCombine/mul.ll
@@ -1255,17 +1255,190 @@ define <2 x i32> @test_mul_canonicalize_vec(<2 x i32> %x, <2 x i32> %y) {
define i32 @test_mul_canonicalize_multiple_uses(i32 %x, i32 %y) {
; CHECK-LABEL: @test_mul_canonicalize_multiple_uses(
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL2]]
+;
+ %neg = sub i32 0, %x
+ %mul = mul i32 %neg, %y
+ %mul2 = mul i32 %mul, %neg
+ ret i32 %mul2
+}
+
+define i32 @mul_nsw_mul_nsw_neg(i32 %x, i32 %y) {
+; CHECK-LABEL: @mul_nsw_mul_nsw_neg(
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL2]]
+;
+ %neg = sub i32 0, %x
+ %mul = mul nsw i32 %neg, %y
+ %mul2 = mul nsw i32 %mul, %neg
+ ret i32 %mul2
+}
+
+define i32 @mul_mul_nsw_neg(i32 %x,i32 %y) {
+; CHECK-LABEL: @mul_mul_nsw_neg(
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL2]]
+;
+ %neg = sub i32 0, %x
+ %mul = mul nsw i32 %neg, %y
+ %mul2 = mul i32 %mul, %neg
+ ret i32 %mul2
+}
+
+define i32 @mul_nsw_mul_neg(i32 %x,i32 %y) {
+; CHECK-LABEL: @mul_nsw_mul_neg(
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL2]]
+;
+ %neg = sub i32 0, %x
+ %mul = mul i32 %neg, %y
+ %mul2 = mul nsw i32 %mul, %neg
+ ret i32 %mul2
+}
+
+define i32 @mul_nsw_mul_neg_onearg(i32 %x) {
+; CHECK-LABEL: @mul_nsw_mul_neg_onearg(
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[X]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL2]]
+;
+ %neg = sub i32 0, %x
+ %mul = mul i32 %neg, %x
+ %mul2 = mul nsw i32 %mul, %neg
+ ret i32 %mul2
+}
+
+define i8 @mul_mul_nsw_neg_onearg(i8 %x) {
+; CHECK-LABEL: @mul_mul_nsw_neg_onearg(
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i8 [[X:%.*]], [[X]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i8 [[MUL_NEG]], [[X]]
+; CHECK-NEXT: ret i8 [[MUL2]]
+;
+ %neg = sub i8 0, %x
+ %mul = mul nsw i8 %neg, %x
+ %mul2 = mul i8 %mul, %neg
+ ret i8 %mul2
+}
+
+define i32 @mul_nsw_mul_nsw_neg_onearg(i32 %x) {
+; CHECK-LABEL: @mul_nsw_mul_nsw_neg_onearg(
+; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[X]]
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL2]]
+;
+ %neg = sub i32 0, %x
+ %mul = mul nsw i32 %neg, %x
+ %mul2 = mul nsw i32 %mul, %neg
+ ret i32 %mul2
+}
+
+define i32 @mul_nsw_shl_nsw_neg(i32 %x, i32 %y) {
+; CHECK-LABEL: @mul_nsw_shl_nsw_neg(
+; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL]]
+;
+ %neg = sub i32 0, %x
+ %shl = shl nsw i32 %neg, %y
+ %mul = mul nsw i32 %shl, %neg
+ ret i32 %mul
+}
+
+define i32 @mul_shl_nsw_neg(i32 %x,i32 %y) {
+; CHECK-LABEL: @mul_shl_nsw_neg(
+; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL]]
+;
+ %neg = sub i32 0, %x
+ %shl = shl nsw i32 %neg, %y
+ %mul = mul i32 %shl, %neg
+ ret i32 %mul
+}
+
+define i32 @mul_nsw_shl_neg(i32 %x,i32 %y) {
+; CHECK-LABEL: @mul_nsw_shl_neg(
+; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL]]
+;
+ %neg = sub i32 0, %x
+ %shl = shl i32 %neg, %y
+ %mul = mul nsw i32 %shl, %neg
+ ret i32 %mul
+}
+
+define i32 @mul_nsw_shl_neg_onearg(i32 %x) {
+; CHECK-LABEL: @mul_nsw_shl_neg_onearg(
+; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[X]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL]]
+;
+ %neg = sub i32 0, %x
+ %shl = shl i32 %neg, %x
+ %mul = mul nsw i32 %shl, %neg
+ ret i32 %mul
+}
+
+define i8 @mul_shl_nsw_neg_onearg(i8 %x) {
+; CHECK-LABEL: @mul_shl_nsw_neg_onearg(
+; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i8 [[X:%.*]], [[X]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[SHL_NEG]], [[X]]
+; CHECK-NEXT: ret i8 [[MUL]]
+;
+ %neg = sub i8 0, %x
+ %shl = shl nsw i8 %neg, %x
+ %mul = mul i8 %shl, %neg
+ ret i8 %mul
+}
+
+define i32 @mul_nsw_shl_nsw_neg_onearg(i32 %x) {
+; CHECK-LABEL: @mul_nsw_shl_nsw_neg_onearg(
+; CHECK-NEXT: [[SHL_NEG:%.*]] = mul i32 [[X:%.*]], [[X]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]]
+; CHECK-NEXT: ret i32 [[MUL]]
+;
+ %neg = sub i32 0, %x
+ %shl = mul nsw i32 %neg, %x
+ %mul = mul nsw i32 %shl, %neg
+ ret i32 %mul
+}
+
+define i32 @mul_use_mul_neg(i32 %x,i32 %y) {
+; CHECK-LABEL: @mul_use_mul_neg(
; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]]
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[NEG]], [[Y:%.*]]
+; CHECK-NEXT: call void @use32(i32 [[MUL]])
; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL]], [[NEG]]
; CHECK-NEXT: ret i32 [[MUL2]]
;
%neg = sub i32 0, %x
%mul = mul i32 %neg, %y
+ call void @use32(i32 %mul)
%mul2 = mul i32 %mul, %neg
ret i32 %mul2
}
+define i32 @mul_shl_use_mul_neg(i32 %x,i32 %y) {
+; CHECK-LABEL: @mul_shl_use_mul_neg(
+; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[NEG]], [[Y:%.*]]
+; CHECK-NEXT: call void @use32(i32 [[SHL]])
+; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[SHL]], [[NEG]]
+; CHECK-NEXT: ret i32 [[MUL2]]
+;
+ %neg = sub i32 0, %x
+ %shl = shl i32 %neg, %y
+ call void @use32(i32 %shl)
+ %mul2 = mul i32 %shl, %neg
+ ret i32 %mul2
+}
+
@X = global i32 5
define i64 @test_mul_canonicalize_neg_is_not_undone(i64 %L1) {
diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll
index 805546099398..573a11599141 100644
--- a/llvm/test/Transforms/InstCombine/or.ll
+++ b/llvm/test/Transforms/InstCombine/or.ll
@@ -753,6 +753,52 @@ define i32 @test45(i32 %x, i32 %y, i32 %z) {
ret i32 %or1
}
+define i32 @test45_commuted1(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test45_commuted1(
+; CHECK-NEXT: [[YY:%.*]] = mul i32 [[Y:%.*]], [[Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: [[OR1:%.*]] = or i32 [[YY]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[OR1]]
+;
+ %yy = mul i32 %y, %y ; thwart complexity-based ordering
+ %or = or i32 %yy, %z
+ %and = and i32 %or, %x
+ %or1 = or i32 %yy, %and
+ ret i32 %or1
+}
+
+define i32 @test45_commuted2(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test45_commuted2(
+; CHECK-NEXT: [[YY:%.*]] = mul i32 [[Y:%.*]], [[Y]]
+; CHECK-NEXT: [[XX:%.*]] = mul i32 [[X:%.*]], [[X]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[XX]], [[Z:%.*]]
+; CHECK-NEXT: [[OR1:%.*]] = or i32 [[YY]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[OR1]]
+;
+ %yy = mul i32 %y, %y ; thwart complexity-based ordering
+ %xx = mul i32 %x, %x ; thwart complexity-based ordering
+ %or = or i32 %yy, %z
+ %and = and i32 %xx, %or
+ %or1 = or i32 %and, %yy
+ ret i32 %or1
+}
+
+define i32 @test45_commuted3(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test45_commuted3(
+; CHECK-NEXT: [[YY:%.*]] = mul i32 [[Y:%.*]], [[Y]]
+; CHECK-NEXT: [[ZZ:%.*]] = mul i32 [[Z:%.*]], [[Z]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[ZZ]], [[X:%.*]]
+; CHECK-NEXT: [[OR1:%.*]] = or i32 [[YY]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[OR1]]
+;
+ %yy = mul i32 %y, %y ; thwart complexity-based ordering
+ %zz = mul i32 %z, %z ; thwart complexity-based ordering
+ %or = or i32 %zz, %yy
+ %and = and i32 %or, %x
+ %or1 = or i32 %and, %yy
+ ret i32 %or1
+}
+
define i1 @test46(i8 signext %c) {
; CHECK-LABEL: @test46(
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[C:%.*]], -33
@@ -1213,11 +1259,11 @@ define i32 @PR46712(i1 %x, i1 %y, i1 %b, i64 %z) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[B:%.*]], label [[TRUE:%.*]], label [[END:%.*]]
; CHECK: true:
-; CHECK-NEXT: [[BOOL5:%.*]] = icmp eq i64 [[Z:%.*]], 0
-; CHECK-NEXT: [[SEL:%.*]] = zext i1 [[BOOL5]] to i32
+; CHECK-NEXT: [[BOOL5_NOT:%.*]] = icmp eq i64 [[Z:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[BOOL5_NOT]] to i32
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[T5:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SEL]], [[TRUE]] ]
+; CHECK-NEXT: [[T5:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP0]], [[TRUE]] ]
; CHECK-NEXT: ret i32 [[T5]]
;
entry:
@@ -1245,11 +1291,11 @@ define i32 @PR46712_logical(i1 %x, i1 %y, i1 %b, i64 %z) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[B:%.*]], label [[TRUE:%.*]], label [[END:%.*]]
; CHECK: true:
-; CHECK-NEXT: [[BOOL5:%.*]] = icmp eq i64 [[Z:%.*]], 0
-; CHECK-NEXT: [[SEL:%.*]] = zext i1 [[BOOL5]] to i32
+; CHECK-NEXT: [[BOOL5_NOT:%.*]] = icmp eq i64 [[Z:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[BOOL5_NOT]] to i32
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[T5:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SEL]], [[TRUE]] ]
+; CHECK-NEXT: [[T5:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP0]], [[TRUE]] ]
; CHECK-NEXT: ret i32 [[T5]]
;
entry:
@@ -1636,3 +1682,98 @@ define i32 @assoc_cast_assoc_disjoint(i16 %x) {
%c = or disjoint i32 %b, 65536
ret i32 %c
}
+
+; (X & C1) | C2 -> X & (C1 | C2) iff (X & C2) == C2
+define i32 @test_or_and_disjoint(i32 %a) {
+; CHECK-LABEL: @test_or_and_disjoint(
+; CHECK-NEXT: [[A0:%.*]] = and i32 [[A:%.*]], 24
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A0]], 8
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[A2:%.*]] = and i32 [[A]], 15
+; CHECK-NEXT: ret i32 [[A2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i32 0
+;
+ %a0 = and i32 %a, 24
+ %cmp = icmp eq i32 %a0, 8
+ br i1 %cmp, label %if.then, label %if.else
+if.then:
+ %a1 = and i32 %a, 7
+ %a2 = or i32 %a1, 8
+ ret i32 %a2
+if.else:
+ ret i32 0
+}
+
+define i32 @test_or_and_mixed(i32 %a) {
+; CHECK-LABEL: @test_or_and_mixed(
+; CHECK-NEXT: [[A0:%.*]] = and i32 [[A:%.*]], 27
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A0]], 11
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[A2:%.*]] = and i32 [[A]], 15
+; CHECK-NEXT: ret i32 [[A2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i32 0
+;
+ %a0 = and i32 %a, 27
+ %cmp = icmp eq i32 %a0, 11
+ br i1 %cmp, label %if.then, label %if.else
+if.then:
+ %a1 = and i32 %a, 7
+ %a2 = or i32 %a1, 11
+ ret i32 %a2
+if.else:
+ ret i32 0
+}
+
+; Negative tests
+
+define i32 @test_or_and_disjoint_fail(i32 %a) {
+; CHECK-LABEL: @test_or_and_disjoint_fail(
+; CHECK-NEXT: [[A0:%.*]] = and i32 [[A:%.*]], 24
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A0]], 16
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[A1:%.*]] = and i32 [[A]], 7
+; CHECK-NEXT: [[A2:%.*]] = or disjoint i32 [[A1]], 8
+; CHECK-NEXT: ret i32 [[A2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i32 0
+;
+ %a0 = and i32 %a, 24
+ %cmp = icmp eq i32 %a0, 16
+ br i1 %cmp, label %if.then, label %if.else
+if.then:
+ %a1 = and i32 %a, 7
+ %a2 = or i32 %a1, 8
+ ret i32 %a2
+if.else:
+ ret i32 0
+}
+
+define i32 @test_or_and_disjoint_multiuse(i32 %a) {
+; CHECK-LABEL: @test_or_and_disjoint_multiuse(
+; CHECK-NEXT: [[A0:%.*]] = and i32 [[A:%.*]], 24
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A0]], 8
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[A1:%.*]] = and i32 [[A]], 7
+; CHECK-NEXT: call void @use(i32 [[A1]])
+; CHECK-NEXT: [[A2:%.*]] = or disjoint i32 [[A1]], 8
+; CHECK-NEXT: ret i32 [[A2]]
+; CHECK: if.else:
+; CHECK-NEXT: ret i32 0
+;
+ %a0 = and i32 %a, 24
+ %cmp = icmp eq i32 %a0, 8
+ br i1 %cmp, label %if.then, label %if.else
+if.then:
+ %a1 = and i32 %a, 7
+ call void @use(i32 %a1)
+ %a2 = or i32 %a1, 8
+ ret i32 %a2
+if.else:
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/InstCombine/select-and-or.ll b/llvm/test/Transforms/InstCombine/select-and-or.ll
index 7edcd767b86e..0f7acd4d56c0 100644
--- a/llvm/test/Transforms/InstCombine/select-and-or.ll
+++ b/llvm/test/Transforms/InstCombine/select-and-or.ll
@@ -613,9 +613,9 @@ define i1 @and_or2_wrong_operand(i1 %a, i1 %b, i1 %c, i1 %d) {
define i1 @and_or3(i1 %a, i1 %b, i32 %x, i32 %y) {
; CHECK-LABEL: @and_or3(
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C]], i1 true, i1 [[A:%.*]]
-; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], i1 [[TMP1]], i1 false
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], i1 [[TMP2]], i1 false
; CHECK-NEXT: ret i1 [[R]]
;
%c = icmp eq i32 %x, %y
@@ -626,9 +626,9 @@ define i1 @and_or3(i1 %a, i1 %b, i32 %x, i32 %y) {
define i1 @and_or3_commuted(i1 %a, i1 %b, i32 %x, i32 %y) {
; CHECK-LABEL: @and_or3_commuted(
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C]], i1 true, i1 [[A:%.*]]
-; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], i1 [[TMP1]], i1 false
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], i1 [[TMP2]], i1 false
; CHECK-NEXT: ret i1 [[R]]
;
%c = icmp eq i32 %x, %y
@@ -665,9 +665,9 @@ define i1 @and_or3_multiuse(i1 %a, i1 %b, i32 %x, i32 %y) {
define <2 x i1> @and_or3_vec(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @and_or3_vec(
-; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[C]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[A:%.*]]
-; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[B:%.*]], <2 x i1> [[TMP1]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[B:%.*]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%c = icmp eq <2 x i32> %x, %y
@@ -678,9 +678,9 @@ define <2 x i1> @and_or3_vec(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %
define <2 x i1> @and_or3_vec_commuted(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @and_or3_vec_commuted(
-; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[C]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[A:%.*]]
-; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[B:%.*]], <2 x i1> [[TMP1]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[B:%.*]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%c = icmp eq <2 x i32> %x, %y
@@ -877,9 +877,9 @@ entry:
define i1 @or_and3(i1 %a, i1 %b, i32 %x, i32 %y) {
; CHECK-LABEL: @or_and3(
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C]], i1 [[B:%.*]], i1 false
-; CHECK-NEXT: [[R:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 [[B:%.*]], i1 false
+; CHECK-NEXT: [[R:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[TMP2]]
; CHECK-NEXT: ret i1 [[R]]
;
%c = icmp eq i32 %x, %y
@@ -890,9 +890,9 @@ define i1 @or_and3(i1 %a, i1 %b, i32 %x, i32 %y) {
define i1 @or_and3_commuted(i1 %a, i1 %b, i32 %x, i32 %y) {
; CHECK-LABEL: @or_and3_commuted(
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C]], i1 [[B:%.*]], i1 false
-; CHECK-NEXT: [[R:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 [[B:%.*]], i1 false
+; CHECK-NEXT: [[R:%.*]] = select i1 [[A:%.*]], i1 true, i1 [[TMP2]]
; CHECK-NEXT: ret i1 [[R]]
;
%c = icmp eq i32 %x, %y
@@ -929,9 +929,9 @@ define i1 @or_and3_multiuse(i1 %a, i1 %b, i32 %x, i32 %y) {
define <2 x i1> @or_and3_vec(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @or_and3_vec(
-; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[C]], <2 x i1> [[B:%.*]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[A:%.*]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> [[B:%.*]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[A:%.*]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[TMP2]]
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%c = icmp eq <2 x i32> %x, %y
@@ -942,9 +942,9 @@ define <2 x i1> @or_and3_vec(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %
define <2 x i1> @or_and3_vec_commuted(<2 x i1> %a, <2 x i1> %b, <2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @or_and3_vec_commuted(
-; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[C]], <2 x i1> [[B:%.*]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[A:%.*]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> [[B:%.*]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[A:%.*]], <2 x i1> <i1 true, i1 true>, <2 x i1> [[TMP2]]
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%c = icmp eq <2 x i32> %x, %y
@@ -965,3 +965,351 @@ define i1 @or_and3_wrong_operand(i1 %a, i1 %b, i32 %x, i32 %y, i1 %d) {
%r = select i1 %cond, i1 %d, i1 %b
ret i1 %r
}
+
+define i8 @test_or_umax(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_or_umax(
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 [[X]], i8 [[TMP1]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp ugt i8 %x, %y
+ %or = select i1 %cond, i1 true, i1 %cmp
+ %ret = select i1 %or, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_or_umin(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_or_umin(
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 [[Y]], i8 [[TMP1]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp ugt i8 %x, %y
+ %or = select i1 %cond, i1 true, i1 %cmp
+ %ret = select i1 %or, i8 %y, i8 %x
+ ret i8 %ret
+}
+
+define i8 @test_and_umax(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_and_umax(
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 [[TMP1]], i8 [[Y]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp ugt i8 %x, %y
+ %and = select i1 %cond, i1 %cmp, i1 false
+ %ret = select i1 %and, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_and_umin(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_and_umin(
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 [[TMP1]], i8 [[X]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp ugt i8 %x, %y
+ %and = select i1 %cond, i1 %cmp, i1 false
+ %ret = select i1 %and, i8 %y, i8 %x
+ ret i8 %ret
+}
+
+define i8 @test_or_umax_bitwise1(i8 %x, i8 %y, i8 %val) {
+; CHECK-LABEL: @test_or_umax_bitwise1(
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[VAL:%.*]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND]], i8 [[X]], i8 [[TMP1]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cond = icmp eq i8 %val, 0 ; thwart complexity-based ordering
+ %cmp = icmp ugt i8 %x, %y
+ %or = or i1 %cond, %cmp
+ %ret = select i1 %or, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_or_umax_bitwise2(i8 %x, i8 %y, i8 %val) {
+; CHECK-LABEL: @test_or_umax_bitwise2(
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[VAL:%.*]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND]], i8 [[X]], i8 [[TMP1]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cond = icmp eq i8 %val, 0 ; thwart complexity-based ordering
+ %cmp = icmp ugt i8 %x, %y
+ %or = or i1 %cmp, %cond
+ %ret = select i1 %or, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_and_umax_bitwise1(i8 %x, i8 %y, i8 %val) {
+; CHECK-LABEL: @test_and_umax_bitwise1(
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[VAL:%.*]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND]], i8 [[TMP1]], i8 [[Y]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cond = icmp eq i8 %val, 0 ; thwart complexity-based ordering
+ %cmp = icmp ugt i8 %x, %y
+ %and = and i1 %cond, %cmp
+ %ret = select i1 %and, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_and_umax_bitwise2(i8 %x, i8 %y, i8 %val) {
+; CHECK-LABEL: @test_and_umax_bitwise2(
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[VAL:%.*]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND]], i8 [[TMP1]], i8 [[Y]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cond = icmp eq i8 %val, 0 ; thwart complexity-based ordering
+ %cmp = icmp ugt i8 %x, %y
+ %and = and i1 %cmp, %cond
+ %ret = select i1 %and, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+; Other SPFs
+
+define i8 @test_or_smax(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_or_smax(
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 [[X]], i8 [[TMP1]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp sgt i8 %x, %y
+ %or = select i1 %cond, i1 true, i1 %cmp
+ %ret = select i1 %or, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_or_abs(i8 %x, i1 %cond) {
+; CHECK-LABEL: @test_or_abs(
+; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[X:%.*]], i1 true)
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[COND:%.*]], i8 [[X]], i8 [[TMP1]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp sgt i8 %x, -1
+ %neg = sub nsw i8 0, %x
+ %or = select i1 %cond, i1 true, i1 %cmp
+ %ret = select i1 %or, i8 %x, i8 %neg
+ ret i8 %ret
+}
+
+; TODO: fold SPF_FMAXNUM
+define float @test_or_fmaxnum(float %x, float %y, i1 %cond) {
+; CHECK-LABEL: @test_or_fmaxnum(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp nnan ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND:%.*]], i1 true, i1 [[CMP]]
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[OR]], float [[X]], float [[Y]]
+; CHECK-NEXT: ret float [[RET]]
+;
+ %cmp = fcmp nnan ogt float %x, %y
+ %or = select i1 %cond, i1 true, i1 %cmp
+ %ret = select i1 %or, float %x, float %y
+ ret float %ret
+}
+
+; Negative tests
+
+define i8 @test_or_umax_invalid_logical(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_or_umax_invalid_logical(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i1 true, i1 [[COND:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[OR]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp ugt i8 %x, %y
+ %or = select i1 %cmp, i1 true, i1 %cond
+ %ret = select i1 %or, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_and_umax_invalid_logical(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_and_umax_invalid_logical(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = select i1 [[CMP]], i1 [[COND:%.*]], i1 false
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[AND]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp ugt i8 %x, %y
+ %and = select i1 %cmp, i1 %cond, i1 false
+ %ret = select i1 %and, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+define i8 @test_or_umax_multiuse_cond(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @test_or_umax_multiuse_cond(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[COND:%.*]], i1 true, i1 [[CMP]]
+; CHECK-NEXT: call void @use(i1 [[OR]])
+; CHECK-NEXT: [[RET:%.*]] = select i1 [[OR]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT: ret i8 [[RET]]
+;
+ %cmp = icmp ugt i8 %x, %y
+ %or = select i1 %cond, i1 true, i1 %cmp
+ call void @use(i1 %or)
+ %ret = select i1 %or, i8 %x, i8 %y
+ ret i8 %ret
+}
+
+; Tests from PR76203
+
+define i8 @test_or_eq_a_b(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_or_eq_a_b(
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp eq i8 %a, %b
+ %cond = or i1 %other_cond, %cmp
+ %select = select i1 %cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define i8 @test_and_ne_a_b(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_and_ne_a_b(
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp ne i8 %a, %b
+ %cond = and i1 %other_cond, %cmp
+ %select = select i1 %cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define i8 @test_or_eq_a_b_commuted(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_or_eq_a_b_commuted(
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND:%.*]], i8 [[B:%.*]], i8 [[A:%.*]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp eq i8 %a, %b
+ %cond = or i1 %other_cond, %cmp
+ %select = select i1 %cond, i8 %b, i8 %a
+ ret i8 %select
+}
+
+define i8 @test_and_ne_a_b_commuted(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_and_ne_a_b_commuted(
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND:%.*]], i8 [[B:%.*]], i8 [[A:%.*]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp ne i8 %a, %b
+ %cond = and i1 %other_cond, %cmp
+ %select = select i1 %cond, i8 %b, i8 %a
+ ret i8 %select
+}
+
+define i8 @test_or_eq_different_operands(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: @test_or_eq_different_operands(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i8 [[A]], i8 [[B:%.*]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp eq i8 %a, %c
+ %cmp1 = icmp eq i8 %b, %a
+ %cond = or i1 %cmp, %cmp1
+ %select = select i1 %cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define i8 @test_or_eq_a_b_multi_use(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_or_eq_a_b_multi_use(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[COND:%.*]] = or i1 [[CMP]], [[OTHER_COND:%.*]]
+; CHECK-NEXT: call void @use(i1 [[CMP]])
+; CHECK-NEXT: call void @use(i1 [[COND]])
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND]], i8 [[A]], i8 [[B]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp eq i8 %a, %b
+ %cond = or i1 %other_cond, %cmp
+ call void @use(i1 %cmp)
+ call void @use(i1 %cond)
+ %select = select i1 %cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define <2 x i8> @test_or_eq_a_b_vec(<2 x i1> %other_cond, <2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: @test_or_eq_a_b_vec(
+; CHECK-NEXT: [[SELECT:%.*]] = select <2 x i1> [[OTHER_COND:%.*]], <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]]
+; CHECK-NEXT: ret <2 x i8> [[SELECT]]
+;
+ %cmp = icmp eq <2 x i8> %a, %b
+ %cond = or <2 x i1> %other_cond, %cmp
+ %select = select <2 x i1> %cond, <2 x i8> %a, <2 x i8> %b
+ ret <2 x i8> %select
+}
+
+define i8 @test_or_ne_a_b(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_or_ne_a_b(
+; CHECK-NEXT: ret i8 [[A:%.*]]
+;
+ %cmp = icmp ne i8 %a, %b
+ %cond = or i1 %other_cond, %cmp
+ %select = select i1 %cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define i8 @test_and_ne_different_operands_fail(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: @test_and_ne_different_operands_fail(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[B:%.*]], [[C]]
+; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP]], [[CMP1]]
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], i8 [[B]], i8 [[A]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp ne i8 %a, %c
+ %cmp1 = icmp ne i8 %b, %c
+ %cond = and i1 %cmp, %cmp1
+ %select = select i1 %cond, i8 %b, i8 %a
+ ret i8 %select
+}
+
+define i8 @test_logical_or_eq_a_b(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_logical_or_eq_a_b(
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp eq i8 %a, %b
+ %or.cond = select i1 %other_cond, i1 true, i1 %cmp
+ %select = select i1 %or.cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define i8 @test_logical_commuted_or_eq_a_b(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_logical_commuted_or_eq_a_b(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 true, i1 [[OTHER_COND:%.*]]
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OR_COND]], i8 [[A]], i8 [[B]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp eq i8 %a, %b
+ %or.cond = select i1 %cmp, i1 true, i1 %other_cond
+ %select = select i1 %or.cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define i8 @test_logical_and_ne_a_b(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_logical_and_ne_a_b(
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OTHER_COND:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp ne i8 %a, %b
+ %or.cond = select i1 %other_cond, i1 %cmp, i1 false
+ %select = select i1 %or.cond, i8 %a, i8 %b
+ ret i8 %select
+}
+
+define i8 @test_logical_commuted_and_ne_a_b(i1 %other_cond, i8 %a, i8 %b) {
+; CHECK-LABEL: @test_logical_commuted_and_ne_a_b(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[OTHER_COND:%.*]], i1 false
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OR_COND]], i8 [[A]], i8 [[B]]
+; CHECK-NEXT: ret i8 [[SELECT]]
+;
+ %cmp = icmp ne i8 %a, %b
+ %or.cond = select i1 %cmp, i1 %other_cond, i1 false
+ %select = select i1 %or.cond, i8 %a, i8 %b
+ ret i8 %select
+}
diff --git a/llvm/test/Transforms/InstCombine/select-factorize.ll b/llvm/test/Transforms/InstCombine/select-factorize.ll
index 1b727a3aaee3..386c8e522759 100644
--- a/llvm/test/Transforms/InstCombine/select-factorize.ll
+++ b/llvm/test/Transforms/InstCombine/select-factorize.ll
@@ -303,7 +303,7 @@ define i1 @and_logic_and_logic_or_not_one_use(i1 %c, i1 %a, i1 %b) {
; CHECK-LABEL: @and_logic_and_logic_or_not_one_use(
; CHECK-NEXT: [[AC:%.*]] = and i1 [[A:%.*]], [[C:%.*]]
; CHECK-NEXT: [[BC:%.*]] = select i1 [[B:%.*]], i1 [[C]], i1 false
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[BC]], i1 true, i1 [[AC]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[B]], i1 [[C]], i1 [[AC]]
; CHECK-NEXT: call void @use(i1 [[AC]])
; CHECK-NEXT: call void @use(i1 [[BC]])
; CHECK-NEXT: ret i1 [[OR]]
@@ -368,7 +368,7 @@ define i1 @and_and_logic_or_not_one_use(i1 %c, i1 %a, i1 %b) {
; CHECK-LABEL: @and_and_logic_or_not_one_use(
; CHECK-NEXT: [[AC:%.*]] = and i1 [[A:%.*]], [[C:%.*]]
; CHECK-NEXT: [[BC:%.*]] = and i1 [[C]], [[B:%.*]]
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[BC]], i1 true, i1 [[AC]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[B]], i1 [[C]], i1 [[AC]]
; CHECK-NEXT: call void @use(i1 [[AC]])
; CHECK-NEXT: call void @use(i1 [[BC]])
; CHECK-NEXT: ret i1 [[OR]]
@@ -532,7 +532,7 @@ define i1 @logic_or_logic_and_not_one_use(i1 %c, i1 %a, i1 %b) {
; CHECK-LABEL: @logic_or_logic_and_not_one_use(
; CHECK-NEXT: [[AC:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[A:%.*]]
; CHECK-NEXT: [[BC:%.*]] = select i1 [[B:%.*]], i1 true, i1 [[C]]
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[BC]], i1 [[AC]], i1 false
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[B]], i1 [[AC]], i1 [[C]]
; CHECK-NEXT: call void @use(i1 [[AC]])
; CHECK-NEXT: call void @use(i1 [[BC]])
; CHECK-NEXT: ret i1 [[OR]]
@@ -681,7 +681,7 @@ define i1 @or_logic_or_logic_and_not_one_use(i1 %c, i1 %a, i1 %b) {
; CHECK-LABEL: @or_logic_or_logic_and_not_one_use(
; CHECK-NEXT: [[AC:%.*]] = or i1 [[C:%.*]], [[A:%.*]]
; CHECK-NEXT: [[BC:%.*]] = select i1 [[B:%.*]], i1 true, i1 [[C]]
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[BC]], i1 [[AC]], i1 false
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[B]], i1 [[AC]], i1 [[C]]
; CHECK-NEXT: call void @use(i1 [[AC]])
; CHECK-NEXT: call void @use(i1 [[BC]])
; CHECK-NEXT: ret i1 [[OR]]
@@ -746,7 +746,7 @@ define i1 @or_or_logic_and_not_one_use(i1 %c, i1 %a, i1 %b) {
; CHECK-LABEL: @or_or_logic_and_not_one_use(
; CHECK-NEXT: [[AC:%.*]] = or i1 [[C:%.*]], [[A:%.*]]
; CHECK-NEXT: [[BC:%.*]] = or i1 [[B:%.*]], [[C]]
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[BC]], i1 [[AC]], i1 false
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[B]], i1 [[AC]], i1 [[C]]
; CHECK-NEXT: call void @use(i1 [[AC]])
; CHECK-NEXT: call void @use(i1 [[BC]])
; CHECK-NEXT: ret i1 [[OR]]
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 7583a75385a7..d3e959b1eaa0 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -1735,6 +1735,21 @@ define float @copysign_type_mismatch(double %x) {
; Negative test
+define <2 x float> @copysign_type_mismatch2(<2 x float> %x) {
+; CHECK-LABEL: @copysign_type_mismatch2(
+; CHECK-NEXT: [[I:%.*]] = bitcast <2 x float> [[X:%.*]] to i64
+; CHECK-NEXT: [[ISPOS:%.*]] = icmp sgt i64 [[I]], -1
+; CHECK-NEXT: [[R:%.*]] = select i1 [[ISPOS]], <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT: ret <2 x float> [[R]]
+;
+ %i = bitcast <2 x float> %x to i64
+ %ispos = icmp sgt i64 %i, -1
+ %r = select i1 %ispos, <2 x float> <float 1.0, float 1.0>, <2 x float> <float -1.0, float -1.0>
+ ret <2 x float> %r
+}
+
+; Negative test
+
define float @copysign_wrong_cmp(float %x) {
; CHECK-LABEL: @copysign_wrong_cmp(
; CHECK-NEXT: [[I:%.*]] = bitcast float [[X:%.*]] to i32
@@ -2880,10 +2895,7 @@ define i8 @select_replacement_loop(i8 %x, i8 %y, i8 %z) {
define i32 @select_replacement_loop2(i32 %arg, i32 %arg2) {
; CHECK-LABEL: @select_replacement_loop2(
; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[ARG:%.*]], [[ARG2:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[ARG2]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MUL]], [[ARG]]
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[DIV]], i32 undef
-; CHECK-NEXT: ret i32 [[SEL]]
+; CHECK-NEXT: ret i32 [[DIV]]
;
%div = udiv i32 %arg, %arg2
%mul = mul nsw i32 %div, %arg2
@@ -3612,8 +3624,8 @@ define i32 @pr62088() {
; CHECK: loop:
; CHECK-NEXT: [[NOT2:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ -2, [[LOOP]] ]
; CHECK-NEXT: [[H_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ 1, [[LOOP]] ]
-; CHECK-NEXT: [[XOR1:%.*]] = or disjoint i32 [[H_0]], [[NOT2]]
-; CHECK-NEXT: [[SUB5:%.*]] = sub i32 -1824888657, [[XOR1]]
+; CHECK-NEXT: [[XOR:%.*]] = or disjoint i32 [[H_0]], [[NOT2]]
+; CHECK-NEXT: [[SUB5:%.*]] = sub i32 -1824888657, [[XOR]]
; CHECK-NEXT: [[XOR6:%.*]] = xor i32 [[SUB5]], -1260914025
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[XOR6]], 824855120
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/xor-icmps.ll b/llvm/test/Transforms/InstCombine/xor-icmps.ll
index c85993ea9a7e..f104cd7fdcad 100644
--- a/llvm/test/Transforms/InstCombine/xor-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/xor-icmps.ll
@@ -171,3 +171,151 @@ define i1 @xor_icmp_ptr(ptr %c, ptr %d) {
ret i1 %xor
}
+; Tests from PR70928
+define i1 @xor_icmp_true_signed(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true_signed(
+; CHECK-NEXT: ret i1 true
+;
+ %cmp = icmp sgt i32 %a, 5
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_true_signed_multiuse1(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true_signed_multiuse1(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT: call void @use(i1 [[CMP]])
+; CHECK-NEXT: ret i1 true
+;
+ %cmp = icmp sgt i32 %a, 5
+ call void @use(i1 %cmp)
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_true_signed_multiuse2(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true_signed_multiuse2(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5
+; CHECK-NEXT: call void @use(i1 [[CMP]])
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6
+; CHECK-NEXT: call void @use(i1 [[CMP1]])
+; CHECK-NEXT: ret i1 true
+;
+ %cmp = icmp sgt i32 %a, 5
+ call void @use(i1 %cmp)
+ %cmp1 = icmp slt i32 %a, 6
+ call void @use(i1 %cmp1)
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_true_signed_commuted(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true_signed_commuted(
+; CHECK-NEXT: ret i1 true
+;
+ %cmp = icmp sgt i32 %a, 5
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp1, %cmp
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_true_unsigned(i32 %a) {
+; CHECK-LABEL: @xor_icmp_true_unsigned(
+; CHECK-NEXT: ret i1 true
+;
+ %cmp = icmp ugt i32 %a, 5
+ %cmp1 = icmp ult i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_to_ne(i32 %a) {
+; CHECK-LABEL: @xor_icmp_to_ne(
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i32 [[A:%.*]], 5
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp sgt i32 %a, 4
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_to_ne_multiuse1(i32 %a) {
+; CHECK-LABEL: @xor_icmp_to_ne_multiuse1(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 4
+; CHECK-NEXT: call void @use(i1 [[CMP]])
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i32 [[A]], 5
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp sgt i32 %a, 4
+ call void @use(i1 %cmp)
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_to_icmp_add(i32 %a) {
+; CHECK-LABEL: @xor_icmp_to_icmp_add(
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[A:%.*]], -6
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[TMP1]], -2
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp sgt i32 %a, 3
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+; Negative tests
+; The result of ConstantRange::difference is not exact.
+define i1 @xor_icmp_invalid_range(i8 %x0) {
+; CHECK-LABEL: @xor_icmp_invalid_range(
+; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X0:%.*]], -5
+; CHECK-NEXT: [[OR_COND:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[OR_COND]]
+;
+ %cmp = icmp eq i8 %x0, 0
+ %cmp4 = icmp ne i8 %x0, 4
+ %or.cond = xor i1 %cmp, %cmp4
+ ret i1 %or.cond
+}
+define i1 @xor_icmp_to_ne_multiuse2(i32 %a) {
+; CHECK-LABEL: @xor_icmp_to_ne_multiuse2(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 4
+; CHECK-NEXT: call void @use(i1 [[CMP]])
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6
+; CHECK-NEXT: call void @use(i1 [[CMP1]])
+; CHECK-NEXT: [[CMP3:%.*]] = xor i1 [[CMP]], [[CMP1]]
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp sgt i32 %a, 4
+ call void @use(i1 %cmp)
+ %cmp1 = icmp slt i32 %a, 6
+ call void @use(i1 %cmp1)
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_to_icmp_add_multiuse1(i32 %a) {
+; CHECK-LABEL: @xor_icmp_to_icmp_add_multiuse1(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 3
+; CHECK-NEXT: call void @use(i1 [[CMP]])
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6
+; CHECK-NEXT: [[CMP3:%.*]] = xor i1 [[CMP]], [[CMP1]]
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp sgt i32 %a, 3
+ call void @use(i1 %cmp)
+ %cmp1 = icmp slt i32 %a, 6
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
+define i1 @xor_icmp_to_icmp_add_multiuse2(i32 %a) {
+; CHECK-LABEL: @xor_icmp_to_icmp_add_multiuse2(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 3
+; CHECK-NEXT: call void @use(i1 [[CMP]])
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6
+; CHECK-NEXT: call void @use(i1 [[CMP1]])
+; CHECK-NEXT: [[CMP3:%.*]] = xor i1 [[CMP]], [[CMP1]]
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp sgt i32 %a, 3
+ call void @use(i1 %cmp)
+ %cmp1 = icmp slt i32 %a, 6
+ call void @use(i1 %cmp1)
+ %cmp3 = xor i1 %cmp, %cmp1
+ ret i1 %cmp3
+}
diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index 585f099fd41b..661c36038a67 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -170,18 +170,16 @@ define i32 @PR49475(i32 %x, i16 %y) {
define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) {
; CHECK-LABEL: @PR49475_infloop(
-; CHECK-NEXT: [[B:%.*]] = icmp eq i32 [[T0:%.*]], 0
; CHECK-NEXT: [[B2:%.*]] = icmp eq i16 [[INSERT:%.*]], 0
-; CHECK-NEXT: [[T1:%.*]] = or i1 [[B]], [[B2]]
-; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[T0]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[T0:%.*]], 1
; CHECK-NEXT: [[TMP2:%.*]] = or disjoint i32 [[TMP1]], 140
; CHECK-NEXT: [[TMP3:%.*]] = zext nneg i32 [[TMP2]] to i64
-; CHECK-NEXT: [[XOR1:%.*]] = select i1 [[T1]], i64 [[TMP3]], i64 140
+; CHECK-NEXT: [[XOR:%.*]] = select i1 [[B2]], i64 [[TMP3]], i64 140
; CHECK-NEXT: [[CONV16:%.*]] = sext i8 [[I162:%.*]] to i64
; CHECK-NEXT: [[SUB17:%.*]] = sub i64 [[CONV16]], [[E:%.*]]
; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[SUB17]], 32
; CHECK-NEXT: [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32
-; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[XOR1]], [[CONV18]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[XOR]], [[CONV18]]
; CHECK-NEXT: [[CONV19:%.*]] = zext i1 [[CMP]] to i16
; CHECK-NEXT: [[OR21:%.*]] = or i16 [[CONV19]], [[INSERT]]
; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = icmp eq i16 [[OR21]], 0
diff --git a/llvm/test/Transforms/InstSimplify/div.ll b/llvm/test/Transforms/InstSimplify/div.ll
index a379e1ec9efe..e13b6f139bcf 100644
--- a/llvm/test/Transforms/InstSimplify/div.ll
+++ b/llvm/test/Transforms/InstSimplify/div.ll
@@ -567,3 +567,100 @@ define <2 x i8> @sdiv_vec_multi_one_bit_divisor(<2 x i8> %x, <2 x i8> %y) {
%res = sdiv <2 x i8> %y, %and
ret <2 x i8> %res
}
+
+define i8 @udiv_exact_mul_nsw(i8 %x) {
+; CHECK-LABEL: @udiv_exact_mul_nsw(
+; CHECK-NEXT: ret i8 [[X:%.*]]
+;
+ %a = mul nsw i8 %x, 24
+ %b = udiv exact i8 %a, 24
+ ret i8 %b
+}
+
+define i8 @sdiv_exact_mul_nuw(i8 %x) {
+; CHECK-LABEL: @sdiv_exact_mul_nuw(
+; CHECK-NEXT: ret i8 [[X:%.*]]
+;
+ %a = mul nuw i8 %x, 24
+ %b = sdiv exact i8 %a, 24
+ ret i8 %b
+}
+
+; Negative tests
+
+define i8 @udiv_exact_mul_nsw_mismatch(i8 %x) {
+; CHECK-LABEL: @udiv_exact_mul_nsw_mismatch(
+; CHECK-NEXT: [[A:%.*]] = mul nsw i8 [[X:%.*]], 24
+; CHECK-NEXT: [[B:%.*]] = udiv exact i8 [[A]], 12
+; CHECK-NEXT: ret i8 [[B]]
+;
+ %a = mul nsw i8 %x, 24
+ %b = udiv exact i8 %a, 12
+ ret i8 %b
+}
+
+define i8 @udiv_exact_mul_nsw_power_of_2(i8 %x) {
+; CHECK-LABEL: @udiv_exact_mul_nsw_power_of_2(
+; CHECK-NEXT: [[A:%.*]] = mul nsw i8 [[X:%.*]], 8
+; CHECK-NEXT: [[B:%.*]] = udiv exact i8 [[A]], 8
+; CHECK-NEXT: ret i8 [[B]]
+;
+ %a = mul nsw i8 %x, 8
+ %b = udiv exact i8 %a, 8
+ ret i8 %b
+}
+
+define i8 @sdiv_exact_mul_nuw_power_of_2(i8 %x) {
+; CHECK-LABEL: @sdiv_exact_mul_nuw_power_of_2(
+; CHECK-NEXT: [[A:%.*]] = mul nuw i8 [[X:%.*]], 8
+; CHECK-NEXT: [[B:%.*]] = sdiv exact i8 [[A]], 8
+; CHECK-NEXT: ret i8 [[B]]
+;
+ %a = mul nuw i8 %x, 8
+ %b = sdiv exact i8 %a, 8
+ ret i8 %b
+}
+
+define i8 @udiv_exact_mul(i8 %x) {
+; CHECK-LABEL: @udiv_exact_mul(
+; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], 24
+; CHECK-NEXT: [[B:%.*]] = udiv exact i8 [[A]], 24
+; CHECK-NEXT: ret i8 [[B]]
+;
+ %a = mul i8 %x, 24
+ %b = udiv exact i8 %a, 24
+ ret i8 %b
+}
+
+define i8 @sdiv_exact_mul(i8 %x) {
+; CHECK-LABEL: @sdiv_exact_mul(
+; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], 24
+; CHECK-NEXT: [[B:%.*]] = sdiv exact i8 [[A]], 24
+; CHECK-NEXT: ret i8 [[B]]
+;
+ %a = mul i8 %x, 24
+ %b = sdiv exact i8 %a, 24
+ ret i8 %b
+}
+
+define i8 @udiv_mul_nsw(i8 %x) {
+; CHECK-LABEL: @udiv_mul_nsw(
+; CHECK-NEXT: [[A:%.*]] = mul nsw i8 [[X:%.*]], 24
+; CHECK-NEXT: [[B:%.*]] = udiv i8 [[A]], 24
+; CHECK-NEXT: ret i8 [[B]]
+;
+ %a = mul nsw i8 %x, 24
+ %b = udiv i8 %a, 24
+ ret i8 %b
+}
+
+define i8 @sdiv_mul_nuw(i8 %x) {
+; CHECK-LABEL: @sdiv_mul_nuw(
+; CHECK-NEXT: [[A:%.*]] = mul nuw i8 [[X:%.*]], 24
+; CHECK-NEXT: [[B:%.*]] = sdiv i8 [[A]], 24
+; CHECK-NEXT: ret i8 [[B]]
+;
+ %a = mul nuw i8 %x, 24
+ %b = sdiv i8 %a, 24
+ ret i8 %b
+}
diff --git a/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll
index 74a1ed27e59d..2a4ce85ed11f 100644
--- a/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll
+++ b/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll
@@ -926,12 +926,8 @@ define <2 x i32> @all_constant_true_undef_false_constexpr_vec() {
define i1 @expand_binop_undef(i32 %x, i32 %y) {
; CHECK-LABEL: @expand_binop_undef(
-; CHECK-NEXT: [[CMP9_NOT_1:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[CMP15:%.*]] = icmp slt i32 [[X]], [[Y]]
-; CHECK-NEXT: [[SPEC_SELECT39:%.*]] = select i1 [[CMP9_NOT_1]], i1 undef, i1 [[CMP15]]
-; CHECK-NEXT: [[SPEC_SELECT40:%.*]] = xor i1 [[CMP9_NOT_1]], true
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = and i1 [[SPEC_SELECT39]], [[SPEC_SELECT40]]
-; CHECK-NEXT: ret i1 [[SPEC_SELECT]]
+; CHECK-NEXT: [[CMP15:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP15]]
;
%cmp9.not.1 = icmp eq i32 %x, %y
%cmp15 = icmp slt i32 %x, %y
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index b9c79f02245c..1b229f551093 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -971,12 +971,8 @@ define <2 x i32> @all_constant_true_undef_false_constexpr_vec() {
define i1 @expand_binop_undef(i32 %x, i32 %y) {
; CHECK-LABEL: @expand_binop_undef(
-; CHECK-NEXT: [[CMP9_NOT_1:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[CMP15:%.*]] = icmp slt i32 [[X]], [[Y]]
-; CHECK-NEXT: [[SPEC_SELECT39:%.*]] = select i1 [[CMP9_NOT_1]], i1 undef, i1 [[CMP15]]
-; CHECK-NEXT: [[SPEC_SELECT40:%.*]] = xor i1 [[CMP9_NOT_1]], true
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = and i1 [[SPEC_SELECT39]], [[SPEC_SELECT40]]
-; CHECK-NEXT: ret i1 [[SPEC_SELECT]]
+; CHECK-NEXT: [[CMP15:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i1 [[CMP15]]
;
%cmp9.not.1 = icmp eq i32 %x, %y
%cmp15 = icmp slt i32 %x, %y
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
index ea2d1ca722ca..9ae9245eb15d 100644
--- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
@@ -364,15 +364,3 @@ define void @load_factor2_fp128(ptr %ptr) {
%v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
ret void
}
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/llvm/test/Transforms/LoopIdiom/ARM/ctlz.ll b/llvm/test/Transforms/LoopIdiom/ARM/ctlz.ll
index ec9271a683f4..c80c94c90534 100644
--- a/llvm/test/Transforms/LoopIdiom/ARM/ctlz.ll
+++ b/llvm/test/Transforms/LoopIdiom/ARM/ctlz.ll
@@ -31,9 +31,7 @@
; Function Attrs: norecurse nounwind uwtable
define i32 @ctlz_and_other(i32 %n, ptr nocapture %a) {
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
%shr8 = lshr i32 %abs_n, 1
%tobool9 = icmp eq i32 %shr8, 0
br i1 %tobool9, label %while.end, label %while.body.preheader
@@ -90,9 +88,7 @@ while.end: ; preds = %while.end.loopexit,
; Function Attrs: norecurse nounwind readnone uwtable
define i32 @ctlz_zero_check(i32 %n) {
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
%tobool4 = icmp eq i32 %abs_n, 0
br i1 %tobool4, label %while.end, label %while.body.preheader
@@ -140,9 +136,7 @@ while.end: ; preds = %while.end.loopexit,
; Function Attrs: norecurse nounwind readnone uwtable
define i32 @ctlz(i32 %n) {
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
br label %while.cond
while.cond: ; preds = %while.cond, %entry
@@ -183,9 +177,7 @@ while.end: ; preds = %while.cond
; Function Attrs: norecurse nounwind readnone uwtable
define i32 @ctlz_add(i32 %n, i32 %i0) {
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
br label %while.cond
while.cond: ; preds = %while.cond, %entry
@@ -227,10 +219,8 @@ while.end: ; preds = %while.cond
; Function Attrs: norecurse nounwind readnone uwtable
define i32 @ctlz_sext(i16 %in) {
entry:
- %n = sext i16 %in to i32
- %c = icmp sgt i16 %in, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs = call i16 @llvm.abs.i16(i16 %in, i1 false)
+ %abs_n = zext i16 %abs to i32
br label %while.cond
while.cond: ; preds = %while.cond, %entry
@@ -244,3 +234,6 @@ while.cond: ; preds = %while.cond, %entry
while.end: ; preds = %while.cond
ret i32 %i.0
}
+
+declare i32 @llvm.abs.i32(i32, i1)
+declare i16 @llvm.abs.i16(i16, i1)
diff --git a/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll b/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll
index 8ea8ba8f3463..1b57fcc39667 100644
--- a/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll
+++ b/llvm/test/Transforms/LoopIdiom/X86/ctlz.ll
@@ -20,9 +20,7 @@
define i32 @ctlz_and_other(i32 %n, ptr nocapture %a) {
; ALL-LABEL: @ctlz_and_other(
; ALL-NEXT: entry:
-; ALL-NEXT: [[C:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; ALL-NEXT: [[NEGN:%.*]] = sub nsw i32 0, [[N]]
-; ALL-NEXT: [[ABS_N:%.*]] = select i1 [[C]], i32 [[N]], i32 [[NEGN]]
+; ALL-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N:%.*]], i1 true)
; ALL-NEXT: [[SHR8:%.*]] = lshr i32 [[ABS_N]], 1
; ALL-NEXT: [[TOBOOL9:%.*]] = icmp eq i32 [[SHR8]], 0
; ALL-NEXT: br i1 [[TOBOOL9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
@@ -56,9 +54,7 @@ define i32 @ctlz_and_other(i32 %n, ptr nocapture %a) {
; ALL-NEXT: ret i32 [[I_0_LCSSA]]
;
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
%shr8 = lshr i32 %abs_n, 1
%tobool9 = icmp eq i32 %shr8, 0
br i1 %tobool9, label %while.end, label %while.body.preheader
@@ -108,9 +104,7 @@ while.end: ; preds = %while.end.loopexit,
define i32 @ctlz_zero_check(i32 %n) {
; ALL-LABEL: @ctlz_zero_check(
; ALL-NEXT: entry:
-; ALL-NEXT: [[C:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; ALL-NEXT: [[NEGN:%.*]] = sub nsw i32 0, [[N]]
-; ALL-NEXT: [[ABS_N:%.*]] = select i1 [[C]], i32 [[N]], i32 [[NEGN]]
+; ALL-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N:%.*]], i1 true)
; ALL-NEXT: [[TOBOOL4:%.*]] = icmp eq i32 [[ABS_N]], 0
; ALL-NEXT: br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
; ALL: while.body.preheader:
@@ -134,9 +128,7 @@ define i32 @ctlz_zero_check(i32 %n) {
; ALL-NEXT: ret i32 [[I_0_LCSSA]]
;
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
%tobool4 = icmp eq i32 %abs_n, 0
br i1 %tobool4, label %while.end, label %while.body.preheader
@@ -238,9 +230,7 @@ while.end: ; preds = %while.end.loopexit,
define i32 @ctlz(i32 %n) {
; ALL-LABEL: @ctlz(
; ALL-NEXT: entry:
-; ALL-NEXT: [[C:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; ALL-NEXT: [[NEGN:%.*]] = sub nsw i32 0, [[N]]
-; ALL-NEXT: [[ABS_N:%.*]] = select i1 [[C]], i32 [[N]], i32 [[NEGN]]
+; ALL-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N:%.*]], i1 true)
; ALL-NEXT: [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
; ALL-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
; ALL-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]]
@@ -260,9 +250,7 @@ define i32 @ctlz(i32 %n) {
; ALL-NEXT: ret i32 [[I_0_LCSSA]]
;
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
br label %while.cond
while.cond: ; preds = %while.cond, %entry
@@ -343,9 +331,7 @@ while.end: ; preds = %while.cond
define i32 @ctlz_add(i32 %n, i32 %i0) {
; ALL-LABEL: @ctlz_add(
; ALL-NEXT: entry:
-; ALL-NEXT: [[C:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; ALL-NEXT: [[NEGN:%.*]] = sub nsw i32 0, [[N]]
-; ALL-NEXT: [[ABS_N:%.*]] = select i1 [[C]], i32 [[N]], i32 [[NEGN]]
+; ALL-NEXT: [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N:%.*]], i1 true)
; ALL-NEXT: [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
; ALL-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
; ALL-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]]
@@ -366,9 +352,7 @@ define i32 @ctlz_add(i32 %n, i32 %i0) {
; ALL-NEXT: ret i32 [[I_0_LCSSA]]
;
entry:
- %c = icmp sgt i32 %n, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
br label %while.cond
while.cond: ; preds = %while.cond, %entry
@@ -452,10 +436,8 @@ while.end: ; preds = %while.cond
define i32 @ctlz_sext(i16 %in) {
; ALL-LABEL: @ctlz_sext(
; ALL-NEXT: entry:
-; ALL-NEXT: [[N:%.*]] = sext i16 [[IN:%.*]] to i32
-; ALL-NEXT: [[C:%.*]] = icmp sgt i16 [[IN]], 0
-; ALL-NEXT: [[NEGN:%.*]] = sub nsw i32 0, [[N]]
-; ALL-NEXT: [[ABS_N:%.*]] = select i1 [[C]], i32 [[N]], i32 [[NEGN]]
+; ALL-NEXT: [[ABS:%.*]] = call i16 @llvm.abs.i16(i16 [[IN:%.*]], i1 false)
+; ALL-NEXT: [[ABS_N:%.*]] = zext i16 [[ABS]] to i32
; ALL-NEXT: [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
; ALL-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
; ALL-NEXT: [[TMP2:%.*]] = sub i32 32, [[TMP1]]
@@ -475,10 +457,8 @@ define i32 @ctlz_sext(i16 %in) {
; ALL-NEXT: ret i32 [[I_0_LCSSA]]
;
entry:
- %n = sext i16 %in to i32
- %c = icmp sgt i16 %in, 0
- %negn = sub nsw i32 0, %n
- %abs_n = select i1 %c, i32 %n, i32 %negn
+ %abs = call i16 @llvm.abs.i16(i16 %in, i1 false)
+ %abs_n = zext i16 %abs to i32
br label %while.cond
while.cond: ; preds = %while.cond, %entry
@@ -753,3 +733,6 @@ while.cond: ; preds = %while.cond, %entry
while.end: ; preds = %while.cond
ret i32 %i.0
}
+
+declare i32 @llvm.abs.i32(i32, i1)
+declare i16 @llvm.abs.i16(i16, i1)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-cost-compare.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-cost-compare.ll
index 9c11bd064ad4..38a754d1faa8 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-cost-compare.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-cost-compare.ll
@@ -8,14 +8,15 @@ target triple = "riscv64"
define void @test1(ptr %a) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 128000
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[A]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ 32000, [[ENTRY]] ]
; CHECK-NEXT: store float 1.000000e+00, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -1
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
-; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[SCEVGEP]], [[SCEVGEP2]]
-; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: [[T21:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[T21]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -38,14 +39,15 @@ exit: ; preds = %loop
define void @test2(ptr %a) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 128000
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[A]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ 32000, [[ENTRY]] ]
; CHECK-NEXT: store float 1.000000e+00, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -1
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
-; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[SCEVGEP]], [[SCEVGEP2]]
-; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: [[T21:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[T21]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: call void @use(ptr [[A]])
; CHECK-NEXT: ret void
@@ -70,18 +72,19 @@ exit: ; preds = %loop
define void @test3(ptr %a, ptr %b) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 128000
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], [[LOOP]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[B]], [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[B:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ 32000, [[ENTRY]] ]
; CHECK-NEXT: [[T17:%.*]] = load float, ptr [[LSR_IV2]], align 4
; CHECK-NEXT: [[T18:%.*]] = fadd float [[T17]], 1.000000e+00
; CHECK-NEXT: store float [[T18]], ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -1
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 4
-; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[SCEVGEP]], [[SCEVGEP4]]
-; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: [[T21:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[T21]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -107,18 +110,19 @@ exit: ; preds = %loop
define void @test4(ptr %a, ptr %b) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 128000
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], [[LOOP]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[B]], [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[LOOP]] ], [ [[B:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[LOOP]] ], [ 32000, [[ENTRY]] ]
; CHECK-NEXT: [[T17:%.*]] = load float, ptr [[LSR_IV2]], align 4
; CHECK-NEXT: [[T18:%.*]] = fadd float [[T17]], 1.000000e+00
; CHECK-NEXT: store float [[T18]], ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -1
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
; CHECK-NEXT: [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 4
-; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[SCEVGEP]], [[SCEVGEP4]]
-; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: [[T21:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[T21]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: call void @use(ptr [[A]])
; CHECK-NEXT: call void @use(ptr [[B]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll
deleted file mode 100644
index aa5fdf59e14c..000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll
+++ /dev/null
@@ -1,1846 +0,0 @@
-; RUN: opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,NEON
-; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,SVE
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-
-; Tests are checking if LV can vectorize loops with function calls
-; using mappings from TLI for scalable and fixed width vectorization.
-
-declare double @acos(double)
-declare float @acosf(float)
-
-define void @acos_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @acos_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vacosq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svacos_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @acos(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @acos_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @acos_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svacos_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @acosf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @acosh(double)
-declare float @acoshf(float)
-
-define void @acosh_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @acosh_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vacoshq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svacosh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @acosh(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @acosh_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @acosh_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vacoshq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svacosh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @acoshf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @asin(double)
-declare float @asinf(float)
-
-define void @asin_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @asin_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vasinq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svasin_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @asin(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @asin_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @asin_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svasin_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @asinf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @asinh(double)
-declare float @asinhf(float)
-
-define void @asinh_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @asinh_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vasinhq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svasinh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @asinh(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @asinh_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @asinh_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vasinhq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svasinh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @asinhf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @atan(double)
-declare float @atanf(float)
-
-define void @atan_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @atan_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vatanq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svatan_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @atan(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @atan_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @atan_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svatan_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @atanf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @atanh(double)
-declare float @atanhf(float)
-
-define void @atanh_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @atanh_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vatanhq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svatanh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @atanh(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @atanh_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @atanh_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vatanhq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svatanh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @atanhf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @cbrt(double)
-declare float @cbrtf(float)
-
-define void @cbrt_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @cbrt_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcbrtq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcbrt_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @cbrt(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @cbrt_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @cbrt_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcbrtq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcbrt_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @cbrtf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @cos(double)
-declare float @cosf(float)
-
-define void @cos_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @cos_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @cos(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @cos_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @cos_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @cosf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @cosh(double)
-declare float @coshf(float)
-
-define void @cosh_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @cosh_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcoshq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcosh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @cosh(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @cosh_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @cosh_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcosh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @coshf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @erf(double)
-declare float @erff(float)
-
-define void @erf_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @erf_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_verfq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_sverf_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @erf(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @erf_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @erf_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_verfq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_sverf_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @erff(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @erfc(double)
-declare float @erfcf(float)
-
-define void @erfc_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @erfc_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_verfcq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_sverfc_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @erfc(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @erfc_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @erfc_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_verfcq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_sverfc_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @erfcf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @exp(double)
-declare float @expf(float)
-
-define void @exp_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @exp_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @exp(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @exp_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @expf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @exp2(double)
-declare float @exp2f(float)
-
-define void @exp2_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @exp2_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @exp2(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp2_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @exp2_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @exp2f(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @exp10(double)
-declare float @exp10f(float)
-
-define void @exp10_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @exp10_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @exp10(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp10_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @exp10_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @exp10f(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @expm1(double)
-declare float @expm1f(float)
-
-define void @expm1_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @expm1_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexpm1q_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexpm1_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @expm1(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @expm1_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @expm1_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexpm1q_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexpm1_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @expm1f(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @lgamma(double)
-declare float @lgammaf(float)
-
-define void @lgamma_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @lgamma_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlgammaq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlgamma_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @lgamma(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @lgamma_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @lgamma_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlgammaq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlgamma_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @lgammaf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @log(double)
-declare float @logf(float)
-
-define void @log_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @log(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @logf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @log1p(double)
-declare float @log1pf(float)
-
-define void @log1p_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log1p_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog1pq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog1p_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @log1p(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log1p_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log1p_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog1pq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog1p_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @log1pf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @log2(double)
-declare float @log2f(float)
-
-define void @log2_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log2_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @log2(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log2_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log2_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @log2f(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @log10(double)
-declare float @log10f(float)
-
-define void @log10_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log10_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @log10(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log10_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @log10_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @log10f(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @sin(double)
-declare float @sinf(float)
-
-define void @sin_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sin_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @sin(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sin_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sin_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @sinf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @sinh(double)
-declare float @sinhf(float)
-
-define void @sinh_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sinh_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsinhq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsinh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @sinh(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sinh_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sinh_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsinh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @sinhf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @sinpi(double)
-declare float @sinpif(float)
-
-define void @sinpi_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sinpi_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsinpiq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsinpi_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @sinpi(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sinpi_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sinpi_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsinpiq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsinpi_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @sinpif(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @sqrt(double)
-declare float @sqrtf(float)
-
-define void @sqrt_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sqrt_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsqrtq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsqrt_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @sqrt(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sqrt_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @sqrt_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsqrtq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsqrt_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @sqrtf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @tan(double)
-declare float @tanf(float)
-
-define void @tan_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @tan_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vtanq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svtan_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @tan(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @tan_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @tan_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vtanq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svtan_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @tanf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @tanh(double)
-declare float @tanhf(float)
-
-define void @tanh_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @tanh_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vtanhq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svtanh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @tanh(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @tanh_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @tanh_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svtanh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @tanhf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @tgamma(double)
-declare float @tgammaf(float)
-
-define void @tgamma_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @tgamma_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vtgammaq_f64(<2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svtgamma_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @tgamma(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @tgamma_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @tgamma_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vtgammaq_f32(<4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svtgamma_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @tgammaf(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @atan2(double, double)
-declare float @atan2f(float, float)
-
-define void @atan2_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @atan2_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vatan2q_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svatan2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @atan2(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @atan2_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @atan2_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svatan2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @atan2f(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @copysign(double, double)
-declare float @copysignf(float, float)
-
-define void @copysign_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @copysign_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcopysignq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcopysign_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @copysign(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @copysign_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @copysign_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcopysignq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcopysign_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @copysignf(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @fdim(double, double)
-declare float @fdimf(float, float)
-
-define void @fdim_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fdim_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfdimq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfdim_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @fdim(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @fdim_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fdim_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfdimq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfdim_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @fdimf(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @fmin(double, double)
-declare float @fminf(float, float)
-
-define void @fmin_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fmin_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfminq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfmin_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @fmin(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @fmin_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fmin_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfminq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfmin_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @fminf(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @fmod(double, double)
-declare float @fmodf(float, float)
-
-define void @fmod_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fmod_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfmodq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfmod_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @fmod(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @fmod_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fmod_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfmodq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfmod_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @fmodf(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @hypot(double, double)
-declare float @hypotf(float, float)
-
-define void @hypot_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @hypot_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vhypotq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svhypot_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @hypot(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @hypot_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @hypot_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vhypotq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svhypot_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @hypotf(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @nextafter(double, double)
-declare float @nextafterf(float, float)
-
-define void @nextafter_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @nextafter_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vnextafterq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svnextafter_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @nextafter(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @nextafter_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @nextafter_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vnextafterq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svnextafter_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @nextafterf(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @pow(double, double)
-declare float @powf(float, float)
-
-define void @pow_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @pow_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @pow(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @pow_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @pow_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @powf(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @fma(double, double, double)
-declare float @fmaf(float, float, float)
-
-define void @fma_f64(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fma_f64(
-; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfmaq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfma_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @fma(double %in, double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @fma_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fma_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfmaq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfma_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @fmaf(float %in, float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll
deleted file mode 100644
index 96d94f72fabf..000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll
+++ /dev/null
@@ -1,554 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(\.|_v|_sv)(ceil|copysign|cos|exp\.|expf?\(|exp2|exp10|fabs|floor|fma|log|m..num|pow|nearbyint|rint|round|sin|sqrt|trunc)|(ret)" --version 2
-; RUN: opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefixes=NEON
-; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefixes=SVE
-
-target triple = "aarch64-unknown-linux-gnu"
-
-; Tests are checking if LV can vectorize loops with llvm math intrinsics
-; using mappings from TLI for scalable and fixed width vectorization.
-
-declare double @llvm.cos.f64(double)
-declare float @llvm.cos.f32(float)
-
-define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-;
-; NEON-LABEL: define void @cos_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @cos_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.cos.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @cos_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @cos_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.cos.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.exp.f64(double)
-declare float @llvm.exp.f32(float)
-
-define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @exp_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.exp.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @exp_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.exp.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.exp2.f64(double)
-declare float @llvm.exp2.f32(float)
-
-define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @exp2_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp2_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.exp2.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @exp2_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp2_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.exp2.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.exp10.f64(double)
-declare float @llvm.exp10.f32(float)
-
-define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @exp10_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp10_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.exp10.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @exp10_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp10_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.exp10.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.log.f64(double)
-declare float @llvm.log.f32(float)
-
-define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @log_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.log.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @log_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.log.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.log2.f64(double)
-declare float @llvm.log2.f32(float)
-
-define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @log2_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log2_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.log2.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @log2_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log2_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.log2.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.log10.f64(double)
-declare float @llvm.log10.f32(float)
-
-define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @log10_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log10_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.log10.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @log10_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log10_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.log10.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.sin.f64(double)
-declare float @llvm.sin.f32(float)
-
-define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @sin_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sin_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.sin.f64(double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @sin_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sin_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.sin.f32(float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.pow.f64(double, double)
-declare float @llvm.pow.f32(float, float)
-
-define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @pow_f64
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @pow_f64
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
- %in = load double, ptr %in.gep, align 8
- %call = tail call double @llvm.pow.f64(double %in, double %in)
- %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
- store double %call, ptr %out.gep, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @pow_f32(ptr noalias %in.ptr, ptr %out.ptr) {
-; NEON-LABEL: define void @pow_f32
-; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @pow_f32
-; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @llvm.pow.f32(float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
index bd0b7a8412b9..bcf5c1888786 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -176,8 +176,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i
; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i16> [[TMP8]] to <16 x i8>
; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP11]]
-; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 16
+; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP12]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP13]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
@@ -459,8 +459,8 @@ define void @old_and_new_size_equalko(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
-; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP1]], align 4
; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 24d6d2d532aa..24c59fdb47b6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -38,8 +38,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
+; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
@@ -55,22 +55,22 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX9]], 0
-; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX9]], 1
-; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP10]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP11]], i32 1
+; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX8]], 1
+; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP9]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP10]], i32 1
; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]])
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i32 0
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1
-; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 2
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 10000
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 10000
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -132,8 +132,8 @@ define void @test_widen_induction(ptr %A, i64 %N) {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP4]], align 4
; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
@@ -156,13 +156,13 @@ define void @test_widen_induction(ptr %A, i64 %N) {
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX7]], 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
; CHECK-NEXT: store <2 x i64> [[VEC_IND8]], ptr [[TMP9]], align 4
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 2
; CHECK-NEXT: [[VEC_IND_NEXT10]] = add <2 x i64> [[VEC_IND8]], <i64 2, i64 2>
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
@@ -224,8 +224,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP5]], align 4
; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
@@ -315,8 +315,8 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[VEC_IND]], <i64 10, i64 10>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STEP_ADD]], <i64 10, i64 10>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP6]], align 4
; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
@@ -339,14 +339,14 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND10:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX9]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[VEC_IND10]], <i64 10, i64 10>
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
; CHECK-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 4
-; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX9]], 2
; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <2 x i64> [[VEC_IND10]], <i64 2, i64 2>
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[IND_END]]
; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
@@ -410,8 +410,8 @@ define void @test_widen_extended_induction(ptr %dst) {
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST:%.*]], i64 0, i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 2
+; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
; CHECK-NEXT: store <2 x i8> [[STEP_ADD]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], <i8 2, i8 2>
@@ -429,17 +429,17 @@ define void @test_widen_extended_induction(ptr %dst) {
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], <i8 0, i8 1>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX8:%.*]] = trunc i32 [[INDEX4]] to i8
-; CHECK-NEXT: [[TMP9:%.*]] = add i8 [[OFFSET_IDX8]], 0
+; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = trunc i32 [[INDEX3]] to i8
+; CHECK-NEXT: [[TMP9:%.*]] = add i8 [[OFFSET_IDX7]], 0
; CHECK-NEXT: [[TMP10:%.*]] = zext i8 [[TMP9]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <2 x i8> [[VEC_IND5]], ptr [[TMP12]], align 1
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i8> [[VEC_IND5]], <i8 2, i8 2>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 10000
+; CHECK-NEXT: store <2 x i8> [[VEC_IND4]], ptr [[TMP12]], align 1
+; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i32 [[INDEX3]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <2 x i8> [[VEC_IND4]], <i8 2, i8 2>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT8]], 10000
; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -492,8 +492,8 @@ define void @test_widen_truncated_induction(ptr %A) {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 2
+; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP4]], align 1
; CHECK-NEXT: store <2 x i8> [[STEP_ADD]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], <i8 2, i8 2>
@@ -512,15 +512,15 @@ define void @test_widen_truncated_induction(ptr %A) {
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], <i8 0, i8 1>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX3]], 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <2 x i8> [[VEC_IND5]], ptr [[TMP10]], align 1
-; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i8> [[VEC_IND5]], <i8 2, i8 2>
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 10000
+; CHECK-NEXT: store <2 x i8> [[VEC_IND4]], ptr [[TMP10]], align 1
+; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX3]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <2 x i8> [[VEC_IND4]], <i8 2, i8 2>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 10000
; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
index e80630412d67..33d7a3a3c8ac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -30,8 +30,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD1]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD1]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -40,8 +40,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
+; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP14]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -129,8 +129,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD5]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD5]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -147,8 +147,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 16
+; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP18]], ptr [[TMP22]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
index 760c294c8853..0e54bd15e5ea 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
@@ -41,12 +41,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP3]]
; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP4]]
; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 32
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; INTERLEAVE-4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 48
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
; INTERLEAVE-4-NEXT: [[TMP13:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; INTERLEAVE-4-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]]
@@ -65,12 +65,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP3]]
; INTERLEAVE-4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP4]]
; INTERLEAVE-4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0
-; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP29]], align 1
; INTERLEAVE-4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 16
-; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP22]], ptr [[TMP30]], align 1
; INTERLEAVE-4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 32
-; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP23]], ptr [[TMP31]], align 1
; INTERLEAVE-4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 48
+; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP29]], align 1
+; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP22]], ptr [[TMP30]], align 1
+; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP23]], ptr [[TMP31]], align 1
; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP24]], ptr [[TMP32]], align 1
; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; INTERLEAVE-4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -155,8 +155,8 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
; INTERLEAVE-2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP2]]
; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; INTERLEAVE-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; INTERLEAVE-2-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; INTERLEAVE-2-NEXT: [[TMP7:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]]
@@ -167,8 +167,8 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP1]]
; INTERLEAVE-2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP2]]
; INTERLEAVE-2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP15]], align 1
; INTERLEAVE-2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 16
+; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP15]], align 1
; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP16]], align 1
; INTERLEAVE-2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; INTERLEAVE-2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index c4f60c8e0911..0d7a1dbc841e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -32,12 +32,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]]
; INTERLEAVE-4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]]
; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1
; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1
; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1
; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD4]]
@@ -87,8 +87,8 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
; INTERLEAVE-2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
; INTERLEAVE-2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
; INTERLEAVE-2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1
; INTERLEAVE-2-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; INTERLEAVE-2-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD2]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 52df1a38f5b4..95dc010811e6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -38,13 +38,13 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP4]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP6]], i64 8
+; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2
; CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[TMP8]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -121,13 +121,13 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 16
+; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP3]], align 2
; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index cdc50c57b947..acbb6e86fe58 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -260,18 +260,18 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16
; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24
; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
; CHECK-UNORDERED-NEXT: [[TMP34]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-UNORDERED-NEXT: [[TMP35]] = fadd <vscale x 8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
@@ -343,18 +343,18 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16
; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24
; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], <vscale x 8 x float> [[WIDE_LOAD1]])
@@ -460,18 +460,18 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]]
; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]]
; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16
; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24
; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i64 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP61]])
@@ -1528,36 +1528,36 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16
; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24
; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8
; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16
; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24
; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
; CHECK-UNORDERED-NEXT: [[TMP48]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
; CHECK-UNORDERED-NEXT: [[TMP49]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
@@ -1631,36 +1631,36 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16
; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24
; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8
; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16
; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24
; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
@@ -1772,36 +1772,36 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]]
; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]]
; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16
; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24
; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]]
; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]]
; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]]
; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]]
; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16
; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24
; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
@@ -1942,36 +1942,36 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16
; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24
; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8
; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16
; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24
; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
; CHECK-UNORDERED-NEXT: [[TMP48]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
; CHECK-UNORDERED-NEXT: [[TMP49]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
@@ -2045,36 +2045,36 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]]
; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8
; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16
; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24
; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP30]], align 4
; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP33]], align 4
; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]]
; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]]
; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8
; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16
; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24
; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP38]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP41]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP44]], align 4
; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP47]], align 4
; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
@@ -2186,36 +2186,36 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]]
; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]]
; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16
; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24
; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]]
; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]]
; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]]
; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]]
; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8
; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16
; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24
; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
deleted file mode 100644
index bd39dcb3371a..000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll
+++ /dev/null
@@ -1,1347 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(_)|(cos|expf?\(|exp2|exp10|fmod|gamma|log|pow|sin|sqrt|tan)|(ret)" --version 2
-; RUN: opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=NEON
-; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SVE
-
-target triple = "aarch64-unknown-linux-gnu"
-
-declare double @acos(double)
-declare float @acosf(float)
-
-define void @acos_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @acos_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @acos_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0:[0-9]+]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @acos(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @acos_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @acos_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @acos_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @acosf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @asin(double)
-declare float @asinf(float)
-
-define void @asin_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @asin_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @asin_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @asin(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @asin_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @asin_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @asin_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @asinf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @atan(double)
-declare float @atanf(float)
-
-define void @atan_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @atan_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @atan_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @atan(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @atan_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @atan_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @atan_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @atanf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @atan2(double, double)
-declare float @atan2f(float, float)
-
-define void @atan2_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @atan2_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @atan2_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x double> [[TMP19]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @atan2(double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @atan2_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @atan2_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @atan2_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @atan2f(float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @atanh(double)
-declare float @atanhf(float)
-
-define void @atanh_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @atanh_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @atanh_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @atanh(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @atanh_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @atanh_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @atanh_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @atanhf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @cos(double)
-declare float @cosf(float)
-
-define void @cos_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @cos_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @cos_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @cos(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @cos_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @cos_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @cos_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @cosf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @cosh(double)
-declare float @coshf(float)
-
-define void @cosh_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @cosh_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @cosh_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @cosh(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @cosh_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @cosh_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @cosh_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @coshf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @exp(double)
-declare float @expf(float)
-
-define void @exp_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @exp_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @exp(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @exp_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @expf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @exp2(double)
-declare float @exp2f(float)
-
-define void @exp2_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @exp2_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp2_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @exp2(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp2_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @exp2_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp2_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @exp2f(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @exp10(double)
-declare float @exp10f(float)
-
-define void @exp10_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @exp10_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp10_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @exp10(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @exp10_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @exp10_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @exp10_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @exp10f(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @fmod(double, double)
-declare float @fmodf(float, float)
-
-define void @fmod_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @fmod_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @fmod_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x double> [[TMP19]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @fmod(double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @fmod_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @fmod_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @fmod_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_fmodf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @fmodf(float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @lgamma(double)
-declare float @lgammaf(float)
-
-define void @lgamma_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @lgamma_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @lgamma_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_lgamma(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @lgamma(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @lgamma_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @lgamma_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @lgamma_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_lgammaf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @lgammaf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @log10(double)
-declare float @log10f(float)
-
-define void @log10_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @log10_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log10_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @log10(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log10_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @log10_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log10_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @log10f(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @log2(double)
-declare float @log2f(float)
-
-define void @log2_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @log2_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log2_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @log2(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log2_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @log2_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log2_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @log2f(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @log(double)
-declare float @logf(float)
-
-define void @log_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @log_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @log(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @log_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @log_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @log_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @logf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @pow(double, double)
-declare float @powf(float, float)
-
-define void @pow_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @pow_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @pow_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x double> [[TMP19]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @pow(double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @pow_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @pow_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @pow_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @powf(float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @sin(double)
-declare float @sinf(float)
-
-define void @sin_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @sin_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sin_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @sin(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sin_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @sin_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sin_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @sinf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @sinh(double)
-declare float @sinhf(float)
-
-define void @sinh_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @sinh_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sinh_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @sinh(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sinh_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @sinh_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sinh_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @sinhf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @sqrt(double)
-declare float @sqrtf(float)
-
-define void @sqrt_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @sqrt_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sqrt_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sqrt(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @sqrt(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @sqrt_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @sqrt_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @sqrt_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sqrtf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @sqrtf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @tan(double)
-declare float @tanf(float)
-
-define void @tan_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @tan_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @tan_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @tan(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @tan_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @tan_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @tan_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @tanf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @tanh(double)
-declare float @tanhf(float)
-
-define void @tanh_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @tanh_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @tanh_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @tanh(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @tanh_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @tanh_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @tanh_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @tanhf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @tgamma(double)
-declare float @tgammaf(float)
-
-define void @tgamma_f64(double* nocapture %varray) {
-; NEON-LABEL: define void @tgamma_f64
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @tgamma_f64
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tgamma(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @tgamma(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @tgamma_f32(float* nocapture %varray) {
-; NEON-LABEL: define void @tgamma_f32
-; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @tgamma_f32
-; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] {
-; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tgammaf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @tgammaf(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
deleted file mode 100644
index 2300ce74996e..000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll
+++ /dev/null
@@ -1,1290 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(\.|_)(ceil|copysign|cos|exp\.|expf?\(|exp2|exp10|fabs|floor|fma|log|m..num|pow|nearbyint|rint|round|sin|sqrt|trunc)|(ret)" --version 2
-; RUN: opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=NEON
-; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SVE
-
-target triple = "aarch64-unknown-linux-gnu"
-
-; These tests check whether LV can vectorize loops with llvm math intrinsics using mappings
-; from the TLI (if such mappings exist) for scalable and fixed-width vectors.
-
-declare double @llvm.ceil.f64(double)
-declare float @llvm.ceil.f32(float)
-
-define void @llvm_ceil_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_ceil_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_ceil_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1:[0-9]+]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.ceil.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_ceil_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_ceil_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_ceil_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.ceil.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.copysign.f64(double, double)
-declare float @llvm.copysign.f32(float, float)
-
-define void @llvm_copysign_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_copysign_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_copysign_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.copysign.f64(double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_copysign_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_copysign_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_copysign_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.copysign.f32(float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.cos.f64(double)
-declare float @llvm.cos.f32(float)
-
-define void @llvm_cos_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_cos_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_cos_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.cos.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_cos_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_cos_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_cos_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.cos.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.exp.f64(double)
-declare float @llvm.exp.f32(float)
-
-define void @llvm_exp_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_exp_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_exp_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.exp.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_exp_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_exp_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_exp_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.exp.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.exp2.f64(double)
-declare float @llvm.exp2.f32(float)
-
-define void @llvm_exp2_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_exp2_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_exp2_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.exp2.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_exp2_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_exp2_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_exp2_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.exp2.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.exp10.f64(double)
-declare float @llvm.exp10.f32(float)
-
-define void @llvm_exp10_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_exp10_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_exp10_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.exp10.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_exp10_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_exp10_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_exp10_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.exp10.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.fabs.f64(double)
-declare float @llvm.fabs.f32(float)
-
-define void @llvm_fabs_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_fabs_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_fabs_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.fabs.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-
-define void @llvm_fabs_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_fabs_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_fabs_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.fabs.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.floor.f64(double)
-declare float @llvm.floor.f32(float)
-
-define void @llvm_floor_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_floor_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_floor_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.floor.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_floor_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_floor_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_floor_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.floor.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.fma.f64(double, double, double)
-declare float @llvm.fma.f32(float, float, float)
-
-define void @llvm_fma_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_fma_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_fma_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x double> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.fma.f64(double %conv, double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_fma_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_fma_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_fma_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.fma.f32(float %conv, float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.log.f64(double)
-declare float @llvm.log.f32(float)
-
-define void @llvm_log_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_log_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_log_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.log.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_log_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_log_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_log_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.log.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.log10.f64(double)
-declare float @llvm.log10.f32(float)
-
-define void @llvm_log10_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_log10_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_log10_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.log10.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_log10_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_log10_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_log10_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.log10.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.log2.f64(double)
-declare float @llvm.log2.f32(float)
-
-define void @llvm_log2_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_log2_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_log2_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.log2.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_log2_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_log2_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_log2_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.log2.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.maxnum.f64(double, double)
-declare float @llvm.maxnum.f32(float, float)
-
-define void @llvm_maxnum_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_maxnum_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_maxnum_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.maxnum.f64(double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_maxnum_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_maxnum_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_maxnum_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.maxnum.f32(float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.minnum.f64(double, double)
-declare float @llvm.minnum.f32(float, float)
-
-define void @llvm_minnum_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_minnum_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_minnum_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.minnum.f64(double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_minnum_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_minnum_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_minnum_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.minnum.f32(float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.nearbyint.f64(double)
-declare float @llvm.nearbyint.f32(float)
-
-define void @llvm_nearbyint_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_nearbyint_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_nearbyint_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.nearbyint.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_nearbyint_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_nearbyint_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_nearbyint_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.nearbyint.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.pow.f64(double, double)
-declare float @llvm.pow.f32(float, float)
-
-define void @llvm_pow_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_pow_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_pow_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.pow.f64(double %conv, double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_pow_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_pow_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_pow_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.pow.f32(float %conv, float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.rint.f64(double)
-declare float @llvm.rint.f32(float)
-
-define void @llvm_rint_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_rint_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_rint_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.rint.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_rint_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_rint_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_rint_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.rint.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.round.f64(double)
-declare float @llvm.round.f32(float)
-
-define void @llvm_round_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_round_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_round_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.round.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_round_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_round_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_round_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.round.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.sin.f64(double)
-declare float @llvm.sin.f32(float)
-
-define void @llvm_sin_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_sin_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_sin_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.sin.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_sin_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_sin_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_sin_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.sin.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.sqrt.f64(double)
-declare float @llvm.sqrt.f32(float)
-
-define void @llvm_sqrt_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_sqrt_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_sqrt_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.sqrt.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_sqrt_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_sqrt_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_sqrt_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.sqrt.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-declare double @llvm.trunc.f64(double)
-declare float @llvm.trunc.f32(float)
-
-define void @llvm_trunc_f64(double* %varray) {
-; NEON-LABEL: define void @llvm_trunc_f64
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_trunc_f64
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to double
- %call = tail call double @llvm.trunc.f64(double %conv)
- %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
- store double %call, double* %arrayidx, align 8
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
-define void @llvm_trunc_f32(float* %varray) {
-; NEON-LABEL: define void @llvm_trunc_f32
-; NEON-SAME: (ptr [[VARRAY:%.*]]) {
-; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1:%.*]])
-; NEON: ret void
-;
-; SVE-LABEL: define void @llvm_trunc_f32
-; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] {
-; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]])
-; SVE: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %tmp = trunc i64 %iv to i32
- %conv = sitofp i32 %tmp to float
- %call = tail call float @llvm.trunc.f32(float %conv)
- %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index 982915fe5cc5..a35c9d0c678c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -35,10 +35,10 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index f48933a76709..0502ff5dc08f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -35,10 +35,10 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index fd3d8d09fb39..9aab1728f9c6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -34,10 +34,10 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP18]], <vscale x 4 x float> [[WIDE_LOAD2]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index d5f41aa440e5..24d2127ee171 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -49,10 +49,10 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), ptr [[TMP16]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 16
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP18]]
+; CHECK-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), ptr [[TMP16]], align 1
; CHECK-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -127,10 +127,10 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP6]]
; CHECK-VF8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
-; CHECK-VF8-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), ptr [[TMP14]], align 1
; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16
; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), ptr [[TMP14]], align 1
; CHECK-VF8-NEXT: store <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), ptr [[TMP17]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -227,10 +227,10 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) {
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP14]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP14]], align 1
; CHECK-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP17]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -296,10 +296,10 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) {
; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
; CHECK-VF8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]]
; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; CHECK-VF8-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP14]], align 1
; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP14]], align 1
; CHECK-VF8-NEXT: store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP17]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -386,10 +386,10 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]]
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP13]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 16
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
+; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP13]], align 1
; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP16]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -469,10 +469,10 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-VF8-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-VF8-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP10]]
; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP11]], align 1
; CHECK-VF8-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16
; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP13]]
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP11]], align 1
; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-VF8-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
index 061b59f5442c..08d0fb77e456 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
@@ -39,19 +39,19 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP12]], align 2
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i64 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP12]], align 2
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x half>, ptr [[TMP15]], align 2
; CHECK-NEXT: [[TMP16:%.*]] = fneg <vscale x 8 x half> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP17:%.*]] = fneg <vscale x 8 x half> [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <vscale x 8 x half> [[TMP16]], ptr [[TMP19]], align 2
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 8
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i64 [[TMP21]]
+; CHECK-NEXT: store <vscale x 8 x half> [[TMP16]], ptr [[TMP19]], align 2
; CHECK-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP22]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
index a9657971d48a..ce1cfda43817 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
@@ -311,10 +311,10 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> poison)
; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP14]]
+; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], ptr [[TMP12]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER2]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index d12a1036ed53..cfb0f9e59ecb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -57,10 +57,10 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8
; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP28]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP29]], align 8
; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 2
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i64 [[TMP31]]
+; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP29]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP13]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
index 006171f3b068..b3a2414693ec 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
@@ -47,18 +47,18 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 {
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP13]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP13]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
-; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -145,18 +145,18 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 {
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP13]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP13]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP18]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
-; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
index 1d5c60bd7374..98081e47b234 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
@@ -63,16 +63,16 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, ptr [[TMP23]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP27]], align 8
; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 2
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP23]], i64 [[TMP29]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP27]], align 8
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 2 x i64>, ptr [[TMP30]], align 8
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[TMP25]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 2 x i64>, ptr [[TMP31]], align 8
; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 2
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP25]], i64 [[TMP33]]
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 2 x i64>, ptr [[TMP31]], align 8
; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 2 x i64>, ptr [[TMP34]], align 8
; CHECK-NEXT: [[TMP35:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD13]]
; CHECK-NEXT: [[TMP36:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD12]], [[WIDE_LOAD14]]
@@ -81,16 +81,16 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i64, ptr [[TMP37]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP35]], ptr [[TMP41]], align 8
; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 2
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i64, ptr [[TMP37]], i64 [[TMP43]]
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP35]], ptr [[TMP41]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP36]], ptr [[TMP44]], align 8
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[TMP39]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP35]], ptr [[TMP45]], align 8
; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 2
; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP39]], i64 [[TMP47]]
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP35]], ptr [[TMP45]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP36]], ptr [[TMP48]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]]
; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 3aac54114c51..cc72dfa4ce63 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -29,7 +29,8 @@ target triple = "aarch64-unknown-linux-gnu"
; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%[0-9]+]]>
; VPLANS-NEXT: vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1>
; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]>
-; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<[[LANEMASK_PHI]]>
+; VPLANS-NEXT: vp<[[VEC_PTR:%[0-9]+]]> = vector-pointer ir<%gep>
+; VPLANS-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%val>, vp<[[LANEMASK_PHI]]>
; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = add vp<[[INDV]]>, vp<[[VFxUF]]>
; VPLANS-NEXT: EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]>
; VPLANS-NEXT: EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]>, vp<[[NEWTC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 4f73d7c1a773..02eac8c02d81 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -83,18 +83,18 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]]
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]]
; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8
; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12
; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]]
; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
@@ -225,18 +225,18 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP41]]
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP46]]
; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8
; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12
; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP62:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]], zeroinitializer
@@ -251,18 +251,18 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[TMP71:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i1> [[TMP63]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP72:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i1> [[TMP64]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, <vscale x 4 x i1> [[TMP69]])
; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 4
; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]])
; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 8
; CHECK-NEXT: [[TMP79:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP78]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, <vscale x 4 x i1> [[TMP71]])
; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 12
; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, <vscale x 4 x i1> [[TMP69]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, <vscale x 4 x i1> [[TMP71]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]])
; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]]
; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index 2cd52bfa87b9..e35a4db36905 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -35,13 +35,13 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{
; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 3
; CHECK-NEXT: [[TMP9:%.*]] = sub i64 1, [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x double>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 3
; CHECK-NEXT: [[TMP13:%.*]] = sub i64 0, [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = sub i64 1, [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x double>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x double>, ptr [[TMP16]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = fadd <vscale x 8 x double> [[WIDE_LOAD]], shufflevector (<vscale x 8 x double> insertelement (<vscale x 8 x double> poison, double 1.000000e+00, i64 0), <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP18:%.*]] = fadd <vscale x 8 x double> [[WIDE_LOAD1]], shufflevector (<vscale x 8 x double> insertelement (<vscale x 8 x double> poison, double 1.000000e+00, i64 0), <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer)
@@ -50,13 +50,13 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{
; CHECK-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 3
; CHECK-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i64 [[TMP22]]
-; CHECK-NEXT: store <vscale x 8 x double> [[TMP17]], ptr [[TMP23]], align 8
; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 3
; CHECK-NEXT: [[TMP26:%.*]] = sub i64 0, [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP25]]
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i64 [[TMP26]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP28]], i64 [[TMP27]]
+; CHECK-NEXT: store <vscale x 8 x double> [[TMP17]], ptr [[TMP23]], align 8
; CHECK-NEXT: store <vscale x 8 x double> [[TMP18]], ptr [[TMP29]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP31]]
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -137,13 +137,13 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 3
; CHECK-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP12]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP13]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP14]], 3
; CHECK-NEXT: [[TMP16:%.*]] = sub i64 0, [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = sub i64 1, [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP18]], i64 [[TMP17]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP13]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i64>, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 8 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i64> [[WIDE_LOAD3]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
@@ -152,13 +152,13 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[TMP24:%.*]] = shl i64 [[TMP23]], 3
; CHECK-NEXT: [[TMP25:%.*]] = sub i64 1, [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP22]], i64 [[TMP25]]
-; CHECK-NEXT: store <vscale x 8 x i64> [[TMP20]], ptr [[TMP26]], align 8
; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP28:%.*]] = shl i64 [[TMP27]], 3
; CHECK-NEXT: [[TMP29:%.*]] = sub i64 0, [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = sub i64 1, [[TMP28]]
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[TMP22]], i64 [[TMP29]]
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[TMP31]], i64 [[TMP30]]
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP20]], ptr [[TMP26]], align 8
; CHECK-NEXT: store <vscale x 8 x i64> [[TMP21]], ptr [[TMP32]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]]
; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index 140c469327bd..899fcce5c02a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -22,10 +22,13 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: EMIT ir<%ptr.iv.1> = WIDEN-POINTER-INDUCTION ir<%start.1>, 8
; CHECK-NEXT: EMIT ir<%ptr.iv.2> = WIDEN-POINTER-INDUCTION ir<%start.2>, 1
; CHECK-NEXT: WIDEN-GEP Var[Inv] ir<%ptr.iv.2.next> = getelementptr inbounds ir<%ptr.iv.2>, ir<1>
-; CHECK-NEXT: WIDEN store ir<%ptr.iv.1>, ir<%ptr.iv.2.next>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%ptr.iv.2>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%ptr.iv.1>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%ptr.iv.2.next>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%ptr.iv.2>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR2]]>
; CHECK-NEXT: WIDEN ir<%add> = add ir<%lv>, ir<1>
-; CHECK-NEXT: WIDEN store ir<%ptr.iv.2>, ir<%add>
+; CHECK-NEXT: vp<[[VEC_PTR3:%.+]]> = vector-pointer ir<%ptr.iv.2>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR3]]>, ir<%add>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index c4386c3ce5d1..1682d0740d0f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -50,18 +50,18 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
; CHECK-NEXT: [[TMP13:%.*]] = add nsw <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
; CHECK-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP18]], align 4
; CHECK-NEXT: [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP19]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 2
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP23]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP19]], ptr [[TMP21]], align 4
; CHECK-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -152,17 +152,17 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 2
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 [[TMP8]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP11:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[NEXT_GEP5]], i64 [[TMP13]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index a35c0a4595d9..ad0d40ee7fbd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -22,10 +22,12 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%load>)
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -49,10 +51,12 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask)
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -81,10 +85,12 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask)
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%call>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXST:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -108,10 +114,12 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>, ir<true>) (using library function: foo_vector_fixed4_mask)
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -139,10 +147,12 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask)
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -166,10 +176,12 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask)
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
new file mode 100644
index 000000000000..ebee4fa42e9b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
@@ -0,0 +1,2641 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|sin|tan|cbrt|erf|exp|gamma|log|sqrt|copysign|dim|min|mod|hypot|nextafter|pow|fma)" --version 2
+; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE
+; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON
+; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=ARMPL-SVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; We are checking whether loops containing function calls can be vectorized
+; when the compiler provides TLI mappings to their vector variants. The tests
+; check fixed-width vectorization with NEON and scalable vectorization with
+; SVE.
+
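+; As a rough, illustrative sketch only (not part of the generated checks; the
+; attribute group number is made up), inject-tli-mappings annotates scalar
+; calls that have a library mapping with the "vector-function-abi-variant"
+; attribute, roughly along the lines of
+;
+;   %call = tail call double @acos(double %in) #9
+;   ...
+;   attributes #9 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_acos(_ZGVnN2v_acos)" }
+;
+; LoopVectorize then consults these mappings to replace the scalar call with
+; the widened library call that the CHECK lines below match.
+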
+declare double @acos(double)
+declare float @acosf(float)
+
+define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @acos_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @acos_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @acos_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacosq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @acos_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svacos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @acos(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @acos_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @acos_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @acos_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacosq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @acos_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svacos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @acosf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @acosh(double)
+declare float @acoshf(float)
+
+define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @acosh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @acosh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @acosh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacoshq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @acosh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svacosh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @acosh(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @acosh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @acosh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @acosh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacoshq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @acosh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svacosh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @acoshf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @asin(double)
+declare float @asinf(float)
+
+define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @asin_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @asin_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @asin_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vasinq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @asin_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svasin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @asin(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @asin_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @asin_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @asin_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @asin_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svasin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @asinf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @asinh(double)
+declare float @asinhf(float)
+
+define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @asinh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @asinh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @asinh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vasinhq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @asinh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svasinh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @asinh(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @asinh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @asinh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @asinh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinhq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @asinh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svasinh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @asinhf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @atan(double)
+declare float @atanf(float)
+
+define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atan_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @atan_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atan_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @atan_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svatan_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @atan(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atan_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @atan_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atan_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @atan_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svatan_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @atanf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @atanh(double)
+declare float @atanhf(float)
+
+define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atanh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @atanh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atanh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanhq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @atanh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svatanh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @atanh(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atanh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @atanh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atanh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanhq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @atanh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svatanh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @atanhf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @cbrt(double)
+declare float @cbrtf(float)
+
+define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cbrt_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cbrt_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cbrt_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcbrtq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cbrt_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcbrt_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @cbrt(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cbrt_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cbrt_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cbrt_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcbrtq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cbrt_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcbrt_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @cbrtf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @cos(double)
+declare float @cosf(float)
+
+define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cos_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cos_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cos_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cos_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @cos(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cos_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cos_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cos_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cos_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @cosf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @cosh(double)
+declare float @coshf(float)
+
+define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cosh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cosh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cosh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcoshq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cosh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcosh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @cosh(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @cosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cosh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cosh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cosh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcoshq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cosh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcosh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @coshf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @erf(double)
+declare float @erff(float)
+
+define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @erf_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @erf_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @erf_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @erf_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_sverf_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @erf(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @erf_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @erf_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @erf_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @erf_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_sverf_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @erff(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @erfc(double)
+declare float @erfcf(float)
+
+define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @erfc_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @erfc_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @erfc_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfcq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @erfc_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_sverfc_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @erfc(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @erfc_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @erfc_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @erfc_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfcq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @erfc_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_sverfc_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @erfcf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @exp(double)
+declare float @expf(float)
+
+define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @exp(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @expf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @exp2(double)
+declare float @exp2f(float)
+
+define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp2_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp2_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp2_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp2_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @exp2(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp2_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp2_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp2_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp2_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @exp2f(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @exp10(double)
+declare float @exp10f(float)
+
+define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp10_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp10_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp10_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp10_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @exp10(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp10_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp10_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp10_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp10_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @exp10f(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @expm1(double)
+declare float @expm1f(float)
+
+define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @expm1_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @expm1_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @expm1_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpm1q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @expm1_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexpm1_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @expm1(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @expm1_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @expm1_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @expm1_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpm1q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @expm1_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexpm1_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @expm1f(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @lgamma(double)
+declare float @lgammaf(float)
+
+define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @lgamma_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @lgamma_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_lgamma(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @lgamma_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @lgamma_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlgamma_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @lgamma(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @lgamma_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @lgamma_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_lgammaf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @lgamma_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @lgamma_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlgamma_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @lgammaf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @log(double)
+declare float @logf(float)
+
+define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @log(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @logf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @log1p(double)
+declare float @log1pf(float)
+
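+; For log1p (and sinpi further down) the SLEEF-NEON and SLEEF-SVE run lines only
+; check that the scalar libm call is left in place, while the ARMPL run lines
+; check the NEON and SVE vector routines.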
+define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log1p_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log1p_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log1p_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog1pq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log1p_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog1p_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @log1p(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log1p_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log1p_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log1p_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog1pq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log1p_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog1p_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @log1pf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @log2(double)
+declare float @log2f(float)
+
+define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log2_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log2_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log2_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log2_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @log2(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log2_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log2_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log2_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log2_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @log2f(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @log10(double)
+declare float @log10f(float)
+
+define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log10_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log10_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log10_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log10_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @log10(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log10_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log10_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log10_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log10_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @log10f(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @sin(double)
+declare float @sinf(float)
+
+define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sin_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sin_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sin_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sin_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @sin(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sin_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sin_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sin_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sin_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @sinf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @sinh(double)
+declare float @sinhf(float)
+
+define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sinh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sinh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sinh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinhq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sinh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsinh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @sinh(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sinh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sinh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sinh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinhq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sinh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsinh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @sinhf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @sinpi(double)
+declare float @sinpif(float)
+
+define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sinpi_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sinpi_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sinpi_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinpiq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sinpi_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsinpi_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @sinpi(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sinpi_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sinpi_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sinpi_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinpiq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sinpi_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsinpi_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @sinpif(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @sqrt(double)
+declare float @sqrtf(float)
+
+define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sqrt_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sqrt_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sqrt(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sqrt_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsqrtq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sqrt_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsqrt_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @sqrt(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sqrt_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sqrt_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sqrtf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sqrt_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsqrtq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sqrt_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsqrt_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @sqrtf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @tan(double)
+declare float @tanf(float)
+
+define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tan_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tan_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tan_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tan_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtan_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @tan(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tan_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tan_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tan_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tan_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svtan_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @tanf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @tanh(double)
+declare float @tanhf(float)
+
+define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tanh_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tanh_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tanh_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanhq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tanh_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtanh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @tanh(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tanh_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tanh_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tanh_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanhq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tanh_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svtanh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @tanhf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @tgamma(double)
+declare float @tgammaf(float)
+
+define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tgamma_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tgamma_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tgamma(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tgamma_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tgamma_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtgamma_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @tgamma(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @tgamma_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @tgamma_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tgammaf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @tgamma_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @tgamma_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svtgamma_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @tgammaf(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
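+; The tests below exercise two-operand routines. The scalar loop passes the same
+; loaded value for both arguments, so the expected vector calls take two identical
+; vector operands (plus the governing predicate for the SVE mappings).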
+declare double @atan2(double, double)
+declare float @atan2f(float, float)
+
+define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atan2_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @atan2_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atan2_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatan2q_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @atan2_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svatan2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @atan2(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @atan2_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @atan2_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @atan2_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatan2q_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @atan2_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svatan2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @atan2f(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @copysign(double, double)
+declare float @copysignf(float, float)
+
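+; As with log1p and sinpi, the SLEEF run lines for copysign, fdim, fmin, hypot and
+; nextafter only check that the scalar call remains, whereas the ARMPL run lines
+; check the corresponding vector routines.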
+define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @copysign_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @copysign_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @copysign_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcopysignq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @copysign_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcopysign_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @copysign(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @copysign_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @copysign_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @copysign_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcopysignq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @copysign_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcopysign_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @copysignf(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @fdim(double, double)
+declare float @fdimf(float, float)
+
+define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fdim_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @fdim_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @fdim_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfdimq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fdim_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfdim_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @fdim(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fdim_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @fdim_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @fdim_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfdimq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fdim_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfdim_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @fdimf(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @fmin(double, double)
+declare float @fminf(float, float)
+
+define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fmin_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @fmin_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @fmin_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfminq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fmin_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfmin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @fmin(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fmin_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @fmin_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @fmin_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfminq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fmin_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfmin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @fminf(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @fmod(double, double)
+declare float @fmodf(float, float)
+
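+; fmod, like atan2, is checked against vector routines in all four run configurations.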
+define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fmod_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @fmod_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @fmod_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmodq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fmod_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfmod_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @fmod(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fmod_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @fmod_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_fmodf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @fmod_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmodq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fmod_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfmod_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @fmodf(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @hypot(double, double)
+declare float @hypotf(float, float)
+
+define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @hypot_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @hypot_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @hypot_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vhypotq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @hypot_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svhypot_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @hypot(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @hypot_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @hypot_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @hypot_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vhypotq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @hypot_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svhypot_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+  %in = load float, ptr %in.gep, align 4
+ %call = tail call float @hypotf(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @nextafter(double, double)
+declare float @nextafterf(float, float)
+
+define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @nextafter_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @nextafter_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @nextafter_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vnextafterq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @nextafter_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svnextafter_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @nextafter(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @nextafter_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @nextafter_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @nextafter_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vnextafterq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @nextafter_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svnextafter_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @nextafterf(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @pow(double, double)
+declare float @powf(float, float)
+
+define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @pow_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @pow_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @pow_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @pow_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @pow(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @pow_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @pow_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @pow_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @pow_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @powf(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @fma(double, double, double)
+declare float @fmaf(float, float, float)
+
+define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fma_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @fma_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @fma_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmaq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fma_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfma_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @fma(double %in, double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fma_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-NEON: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
+; SLEEF-SVE-LABEL: define void @fma_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; SLEEF-SVE: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
+; ARMPL-NEON-LABEL: define void @fma_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmaq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fma_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfma_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @fmaf(float %in, float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
new file mode 100644
index 000000000000..d59c28849bfb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
@@ -0,0 +1,1547 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|exp|log|sin|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|trunc)" --version 2
+
+; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE
+; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON
+; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=ARMPL-SVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; We are checking whether loops containing intrinsic calls can be vectorized
+; when the compiler provides TLI mappings to their vector variants. The tests
+; check fixed-width vectorization with NEON and scalable vectorization with
+; SVE.
+
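+; For example, with -vector-library=sleefgnuabi and +neon, a scalar call in
+; the loop body such as
+;   %call = tail call double @llvm.cos.f64(double %in)
+; should be widened into a call to the TLI-mapped vector routine, roughly
+;   %vcall = call <2 x double> @_ZGVnN2v_cos(<2 x double> %wide.load)
+; while the SVE runs should use the masked variant @_ZGVsMxv_cos together with
+; an active lane mask (the %vcall name above is illustrative; the check lines
+; below match the actual values produced by the vectorizer).
+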
+declare double @llvm.cos.f64(double)
+declare float @llvm.cos.f32(float)
+
+define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cos_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cos_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cos_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cos_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.cos.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @cos_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @cos_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @cos_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @cos_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.cos.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.exp.f64(double)
+declare float @llvm.exp.f32(float)
+
+define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.exp.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @exp_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.exp.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.exp2.f64(double)
+declare float @llvm.exp2.f32(float)
+
+define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp2_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp2_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp2_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp2_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.exp2.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @exp2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp2_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp2_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp2_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp2_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.exp2.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.exp10.f64(double)
+declare float @llvm.exp10.f32(float)
+
+define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp10_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp10_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp10_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp10_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.exp10.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @exp10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @exp10_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @exp10_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @exp10_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @exp10_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.exp10.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.log.f64(double)
+declare float @llvm.log.f32(float)
+
+define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.log.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @log_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.log.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.log2.f64(double)
+declare float @llvm.log2.f32(float)
+
+define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log2_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log2_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log2_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log2_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.log2.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @log2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log2_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log2_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log2_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log2_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.log2.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.log10.f64(double)
+declare float @llvm.log10.f32(float)
+
+define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log10_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log10_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log10_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log10_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.log10.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @log10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @log10_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @log10_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @log10_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @log10_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.log10.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.sin.f64(double)
+declare float @llvm.sin.f32(float)
+
+define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sin_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sin_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sin_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sin_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.sin.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sin_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sin_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sin_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sin_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.sin.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.pow.f64(double, double)
+declare float @llvm.pow.f32(float, float)
+
+define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @pow_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @pow_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @pow_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @pow_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.pow.f64(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @pow_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @pow_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @pow_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @pow_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @pow_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.pow.f32(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.ceil.f64(double)
+declare float @llvm.ceil.f32(float)
+
+define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @ceil_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @ceil_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @ceil_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @ceil_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.ceil.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @ceil_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @ceil_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @ceil_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @ceil_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @ceil_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.ceil.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.copysign.f64(double, double)
+declare float @llvm.copysign.f32(float, float)
+
+define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @copysign_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @copysign_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @copysign_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @copysign_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.copysign.f64(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @copysign_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @copysign_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @copysign_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @copysign_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @copysign_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.copysign.f32(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.fabs.f64(double)
+declare float @llvm.fabs.f32(float)
+
+define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fabs_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @fabs_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @fabs_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @fabs_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.fabs.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @fabs_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fabs_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @fabs_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @fabs_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @fabs_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.fabs.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.floor.f64(double)
+declare float @llvm.floor.f32(float)
+
+define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @floor_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @floor_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @floor_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @floor_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.floor.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @floor_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @floor_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @floor_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @floor_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @floor_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.floor.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.fma.f64(double, double, double)
+declare float @llvm.fma.f32(float, float, float)
+
+define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fma_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @fma_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @fma_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fma_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.fma.f64(double %in, double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @fma_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @fma_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @fma_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @fma_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.fma.f32(float %in, float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.maxnum.f64(double, double)
+declare float @llvm.maxnum.f32(float, float)
+
+define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @maxnum_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @maxnum_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @maxnum_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @maxnum_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.maxnum.f64(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @maxnum_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @maxnum_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @maxnum_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @maxnum_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @maxnum_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.maxnum.f32(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.minnum.f64(double, double)
+declare float @llvm.minnum.f32(float, float)
+
+define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @minnum_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @minnum_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @minnum_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @minnum_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.minnum.f64(double %in, double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @minnum_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @minnum_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; SLEEF-SVE-LABEL: define void @minnum_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+; ARMPL-NEON-LABEL: define void @minnum_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; ARMPL-SVE-LABEL: define void @minnum_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.minnum.f32(float %in, float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.nearbyint.f64(double)
+declare float @llvm.nearbyint.f32(float)
+
+define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @nearbyint_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @nearbyint_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @nearbyint_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @nearbyint_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.nearbyint.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @nearbyint_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @nearbyint_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @nearbyint_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @nearbyint_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @nearbyint_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.nearbyint.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.rint.f64(double)
+declare float @llvm.rint.f32(float)
+
+define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @rint_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @rint_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @rint_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @rint_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.rint.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @rint_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @rint_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @rint_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @rint_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @rint_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.rint.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.round.f64(double)
+declare float @llvm.round.f32(float)
+
+define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @round_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @round_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @round_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @round_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.round.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @round_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @round_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @round_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @round_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @round_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.round.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.sqrt.f64(double)
+declare float @llvm.sqrt.f32(float)
+
+define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sqrt_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sqrt_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sqrt_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sqrt_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.sqrt.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @sqrt_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @sqrt_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @sqrt_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @sqrt_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @sqrt_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.sqrt.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+declare double @llvm.trunc.f64(double)
+declare float @llvm.trunc.f32(float)
+
+define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @trunc_f64
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @trunc_f64
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @trunc_f64
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @trunc_f64
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv
+ %in = load double, ptr %in.gep, align 8
+ %call = tail call double @llvm.trunc.f64(double %in)
+ %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv
+ store double %call, ptr %out.gep, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
+define void @trunc_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; SLEEF-NEON-LABEL: define void @trunc_f32
+; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; SLEEF-SVE-LABEL: define void @trunc_f32
+; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+; ARMPL-NEON-LABEL: define void @trunc_f32
+; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; ARMPL-SVE-LABEL: define void @trunc_f32
+; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]])
+;
+ entry:
+ br label %for.body
+
+ for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
+ %in = load float, ptr %in.gep, align 8
+ %call = tail call float @llvm.trunc.f32(float %in)
+ %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
+ store float %call, ptr %out.gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+ for.end:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
index 876d58131bd7..16506b3a5757 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux)" --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux|goo)" --version 2
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=NEON
; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=SVE_OR_NEON
; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF
@@ -15,13 +15,13 @@ define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) {
;
; SVE_OR_NEON-LABEL: define void @test_linear8
; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; SVE_OR_NEON: [[TMP13:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_nomask_sve(ptr [[TMP12:%.*]])
+; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_nomask_sve(ptr [[TMP14:%.*]])
; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_linear8
; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; SVE_TF: [[TMP19:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP18:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
+; SVE_TF: [[TMP21:%.*]] = call <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr [[TMP20:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
;
entry:
br label %for.body
@@ -48,12 +48,12 @@ define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly %
;
; SVE_OR_NEON-LABEL: define void @test_vector_linear4
; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_baz_vector_linear4_nomask_sve(<vscale x 4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP14:%.*]])
+; SVE_OR_NEON: [[TMP17:%.*]] = call <vscale x 4 x i32> @vec_baz_vector_linear4_nomask_sve(<vscale x 4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]])
; SVE_OR_NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_vector_linear4
; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
;
entry:
br label %for.body
@@ -85,7 +85,7 @@ define void @test_linear8_bad_stride(ptr noalias %a, ptr readnone %b, i64 %n) {
;
; SVE_TF-LABEL: define void @test_linear8_bad_stride
; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]]
;
entry:
br label %for.body
@@ -112,12 +112,12 @@ define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n)
;
; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride
; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP14:%.*]] = call <vscale x 2 x i64> @vec_foo_linear16_nomask_sve(ptr [[TMP13:%.*]])
+; SVE_OR_NEON: [[TMP16:%.*]] = call <vscale x 2 x i64> @vec_foo_linear16_nomask_sve(ptr [[TMP15:%.*]])
; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4]]
;
; SVE_TF-LABEL: define void @test_linear16_wide_stride
; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR5]]
+; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]]
;
entry:
br label %for.body
@@ -145,13 +145,13 @@ define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly
;
; SVE_OR_NEON-LABEL: define void @test_linear4_linear8
; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP13:%.*]], ptr [[TMP14:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; SVE_OR_NEON: [[TMP17:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP15:%.*]], ptr [[TMP16:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; SVE_OR_NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_linear4_linear8
; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[TMP21:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP19:%.*]], ptr [[TMP20:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
-; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]]
+; SVE_TF: [[TMP23:%.*]] = call <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr [[TMP21:%.*]], ptr [[TMP22:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]]
;
entry:
br label %for.body
@@ -179,12 +179,12 @@ define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) {
;
; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr
; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP13:%.*]] = call <vscale x 4 x i32> @vec_bar_linear3_nomask_sve(i32 [[TMP12:%.*]])
+; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_bar_linear3_nomask_sve(i32 [[TMP14:%.*]])
; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR6:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_linear3_non_ptr
; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR7:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]]
;
entry:
br label %for.body
@@ -212,12 +212,12 @@ define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) {
;
; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride
; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP13:%.*]] = call <vscale x 4 x i32> @vec_bar_linearn5_nomask_sve(i32 [[TMP12:%.*]])
+; SVE_OR_NEON: [[TMP15:%.*]] = call <vscale x 4 x i32> @vec_bar_linearn5_nomask_sve(i32 [[TMP14:%.*]])
; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR7:[0-9]+]]
;
; SVE_TF-LABEL: define void @test_linearn5_non_ptr_neg_stride
; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR8:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]]
;
entry:
br label %for.body
@@ -237,10 +237,44 @@ for.cond.cleanup:
ret void
}
+define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) {
+; NEON-LABEL: define void @test_linear8_return_void
+; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) {
+; NEON: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]])
+; NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]]
+;
+; SVE_OR_NEON-LABEL: define void @test_linear8_return_void
+; SVE_OR_NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON: call void @vec_goo_linear8_nomask_sve(<vscale x 2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]])
+; SVE_OR_NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR8:[0-9]+]]
+;
+; SVE_TF-LABEL: define void @test_linear8_return_void
+; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF: call void @vec_goo_linear8_mask_sve(<vscale x 2 x i64> [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gep.in = getelementptr i64, ptr %in, i64 %indvars.iv
+ %num = load i64, ptr %gep.in, align 8
+ %gep.out = getelementptr i64, ptr %out, i64 %indvars.iv
+ call void @goo(i64 %num, ptr %gep.out) #6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
declare i64 @foo(ptr)
declare i32 @baz(i32, ptr)
declare i32 @quux(ptr, ptr)
declare i32 @bar(i32)
+declare void @goo(i64, ptr)
; neon vector variants of foo
declare <2 x i64> @vec_foo_linear8_nomask_neon(ptr)
@@ -249,6 +283,7 @@ declare <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32>, ptr)
declare <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr, ptr)
declare <4 x i32> @vec_bar_linear3_nomask_neon(i32)
declare <4 x i32> @vec_bar_linearn5_nomask_neon(i32)
+declare void @vec_goo_linear8_nomask_neon(<2 x i64>, ptr)
; scalable vector variants of foo
declare <vscale x 2 x i64> @vec_foo_linear8_mask_sve(ptr, <vscale x 2 x i1>)
@@ -258,6 +293,8 @@ declare <vscale x 4 x i32> @vec_baz_vector_linear4_nomask_sve(<vscale x 4 x i32>
declare <vscale x 4 x i32> @vec_quux_linear4_linear8_mask_sve(ptr, ptr, <vscale x 4 x i1>)
declare <vscale x 4 x i32> @vec_bar_linear3_nomask_sve(i32)
declare <vscale x 4 x i32> @vec_bar_linearn5_nomask_sve(i32)
+declare void @vec_goo_linear8_nomask_sve(<vscale x 2 x i64>, ptr)
+declare void @vec_goo_linear8_mask_sve(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>)
attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVsNxl8_foo(vec_foo_linear8_nomask_sve),_ZGVsMxl8_foo(vec_foo_linear8_mask_sve),_ZGVnN2l8_foo(vec_foo_linear8_nomask_neon)" }
attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVsNxvl4_baz(vec_baz_vector_linear4_nomask_sve),_ZGVnN4vl4_baz(vec_baz_vector_linear4_nomask_neon)" }
@@ -265,3 +302,4 @@ attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVsNxl16_foo(vec_foo
attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxl4l8_quux(vec_quux_linear4_linear8_mask_sve),_ZGVnN4l4l8_quux(vec_quux_linear4_linear8_nomask_neon)" }
attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVsNxl3_bar(vec_bar_linear3_nomask_sve),_ZGVnN4l3_bar(vec_bar_linear3_nomask_neon)" }
attributes #5 = { nounwind "vector-function-abi-variant"="_ZGVsNxln5_bar(vec_bar_linearn5_nomask_sve),_ZGVnN4ln5_bar(vec_bar_linearn5_nomask_neon)" }
+attributes #6 = { nounwind "vector-function-abi-variant"="_ZGVsNxvl8_goo(vec_goo_linear8_nomask_sve),_ZGVsMxvl8_goo(vec_goo_linear8_mask_sve),_ZGVnN2vl8_goo(vec_goo_linear8_nomask_neon)" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index bf30e41d5573..347ec55f6a99 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -35,19 +35,19 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[N]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -3
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -7
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -7
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP7]], i64 -3
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP7]], i64 -7
; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP7]], i64 -3
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP7]], i64 -7
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE4]], <4 x double> poison)
; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
index 6bd305fe3363..2abc787061b5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -17,9 +17,9 @@ define void @vector_reverse_f64(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, ptr {{.*}}, i64 {{.*}}
-; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, ptr %[[GEP2]], i32 0
; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, ptr %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: store <8 x double> %[[REVERSE6]], ptr %[[GEP4]], align 8
entry:
@@ -50,9 +50,9 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: %[[FADD:.*]] = add <8 x i64> %[[REVERSE]]
; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, ptr {{.*}}, i64 {{.*}}
-; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[FADD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, ptr %[[GEP2]], i32 0
; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, ptr %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[FADD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], ptr %[[GEP4]], align 8
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index a3a2223a2528..f67b932b04a0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -20,7 +20,8 @@ target triple = "arm64-apple-ios"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double
; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64)
; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]>
@@ -48,7 +49,8 @@ target triple = "arm64-apple-ios"
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double
; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using vector intrinsic)
; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]>
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
index b29abbd78ed7..4988ba0d7d37 100644
--- a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
@@ -14,8 +14,8 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) {
; GFX9-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; GFX9-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; GFX9-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[S:%.*]], i64 [[INDEX]]
-; GFX9-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2
; GFX9-NEXT: [[TMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP0]], i64 2
+; GFX9-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2
; GFX9-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP2]], align 2
; GFX9-NEXT: [[TMP4]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]]
; GFX9-NEXT: [[TMP5]] = fadd fast <2 x half> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -44,8 +44,8 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) {
; VI-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; VI-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; VI-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[S:%.*]], i64 [[INDEX]]
-; VI-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2
; VI-NEXT: [[TMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP0]], i64 2
+; VI-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2
; VI-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP2]], align 2
; VI-NEXT: [[TMP4]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]]
; VI-NEXT: [[TMP5]] = fadd fast <2 x half> [[VEC_PHI1]], [[WIDE_LOAD2]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
index c00f747d54db..c3f307c234ed 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
@@ -753,8 +753,8 @@ define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr
; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 192
@@ -824,12 +824,12 @@ define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 8
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 12
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP10]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 384
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index a448b3086c44..c358c835597d 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -48,28 +48,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP10]]
; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP11]]
; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 4
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4
; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 12
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4
; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 20
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 28
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4
; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 32
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 36
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 40
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4
; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 44
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]]
; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]]
@@ -84,28 +84,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP10]]
; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP11]]
; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4
; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4
; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4
; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4
; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4
; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 32
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4
; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 36
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4
; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 40
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4
; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 44
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, ptr [[TMP82]], align 4
; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]]
; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD14]]
@@ -132,28 +132,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP10]]
; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP11]]
; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 0
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4
; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 4
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4
; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 8
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4
; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 12
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4
; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 16
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4
; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 20
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4
; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 24
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4
; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 28
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4
; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 32
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4
; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 36
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4
; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 40
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4
; VF-TWO-CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 44
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP95]], ptr [[TMP130]], align 4
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48
; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -251,28 +251,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP10]]
; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP11]]
; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 4
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 12
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 20
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 28
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 32
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 36
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 40
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 44
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]]
; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]]
@@ -287,28 +287,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP10]]
; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP11]]
; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 32
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 36
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 40
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 44
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, ptr [[TMP82]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]]
; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD14]]
@@ -335,28 +335,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP10]]
; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP11]]
; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 0
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 4
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 8
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 12
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 16
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 20
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 24
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 28
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 32
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 36
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 40
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 44
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP95]], ptr [[TMP130]], align 4
; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48
; VF-FOUR-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -524,34 +524,34 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP47]]
; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i32 -3
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4
; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP59]], i32 -3
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8
; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP62]], i32 -3
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12
; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP65]], i32 -3
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16
; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 -3
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20
; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 -3
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24
; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, ptr [[TMP74]], i32 -3
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28
; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 -3
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP78]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
@@ -571,20 +571,20 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]]
; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]]
; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 0
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4
; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 4
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4
; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 8
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4
; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 12
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4
; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 16
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4
; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 20
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4
; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 24
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4
; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 28
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP110]], align 4
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -731,34 +731,34 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP47]]
; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4
; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP59]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8
; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP62]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12
; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP65]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16
; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20
; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24
; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, ptr [[TMP74]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28
; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP78]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
@@ -778,20 +778,20 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]]
; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]]
; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 0
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 4
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 8
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 12
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 16
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 20
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 24
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 28
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP110]], align 4
; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index 1f1891ee4548..dcd78aa7f1e3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -67,8 +67,8 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
@@ -171,8 +171,8 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
@@ -275,8 +275,8 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = urem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP7:%.*]] = urem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
@@ -379,8 +379,8 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = srem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP7:%.*]] = srem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
@@ -493,8 +493,8 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
@@ -627,8 +627,8 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
@@ -756,8 +756,8 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], <i64 42, i64 42, i64 42, i64 42>
; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], <i64 42, i64 42, i64 42, i64 42>
@@ -883,8 +883,8 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], <i64 42, i64 42, i64 42, i64 42>
; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], <i64 42, i64 42, i64 42, i64 42>
@@ -1011,8 +1011,8 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]]
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 32
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD]], <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index a9cd91b5a8cb..6fa197591ab3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -553,8 +553,8 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
+; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
; CHECK-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -633,8 +633,8 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
+; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
; CHECK-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index ca8c85712fc4..1bcd7a2e009e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -67,10 +67,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: WIDEN store ir<%arrayidx3>, ir<%add9>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -205,10 +207,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: WIDEN store ir<%arrayidx3>, ir<%conv1>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer (reverse) ir<%arrayidx3>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index a12dfbaec195..8dc5e5d3ffd1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -68,8 +68,8 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -244,8 +244,8 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -402,8 +402,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI3]], ptr [[TMP9]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
@@ -628,8 +628,8 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -803,8 +803,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -995,8 +995,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
; FIXEDLEN-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8
; FIXEDLEN-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP11]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -1235,8 +1235,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
@@ -1459,8 +1459,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index f1518189c41a..77c41453f486 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -139,12 +139,12 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
-; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP20]], align 4
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 2
-; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 4
-; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[TMP22]], align 4
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 6
+; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP20]], align 4
+; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP21]], align 4
+; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[TMP22]], align 4
; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP23]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]], <i64 2, i64 2>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 90dae0421007..c1be67853bf7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -174,12 +174,12 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP15]], align 2
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 16
-; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP16]], align 2
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 32
-; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP17]], align 2
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 48
+; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP15]], align 2
+; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP16]], align 2
+; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP17]], align 2
; CHECK-NEXT: store <16 x i16> [[TMP10]], ptr [[TMP18]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD5]], [[DOTSPLAT3]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index a96cfa7cd99e..8004563f3816 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -30,8 +30,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD1]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD1]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -40,8 +40,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
+; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP14]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -129,8 +129,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD5]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD5]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -147,8 +147,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 16
+; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP18]], ptr [[TMP22]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index fb87228a9ff9..c8c791b30163 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -36,12 +36,12 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <8 x float> [[VEC_IND]], <float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01, float 1.200000e+01>
; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], ptr [[TMP1]], align 4
; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
-; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], ptr [[TMP2]], align 4
; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 16
-; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], ptr [[TMP3]], align 4
; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 24
+; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], ptr [[TMP1]], align 4
+; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], ptr [[TMP2]], align 4
+; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], ptr [[TMP3]], align 4
; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], ptr [[TMP4]], align 4
; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <8 x float> [[VEC_IND]], <float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01>
@@ -211,12 +211,12 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) {
; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 2.400000e+01, double 2.400000e+01, double 2.400000e+01, double 2.400000e+01>
; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], <double 3.600000e+01, double 3.600000e+01, double 3.600000e+01, double 3.600000e+01>
; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]]
-; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], ptr [[TMP1]], align 8
; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i64 4
-; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], ptr [[TMP2]], align 8
; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 8
-; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], ptr [[TMP3]], align 8
; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP1]], i64 12
+; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], ptr [[TMP1]], align 8
+; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], ptr [[TMP2]], align 8
+; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], ptr [[TMP3]], align 8
; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], ptr [[TMP4]], align 8
; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], <double 4.800000e+01, double 4.800000e+01, double 4.800000e+01, double 4.800000e+01>
@@ -377,12 +377,12 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], <float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02>
; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], <float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02, float 3.360000e+02>
; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 [[INDEX]]
-; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 8
-; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 16
-; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 24
+; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
; AUTO_VEC-NEXT: [[TMP6:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]]
; AUTO_VEC-NEXT: [[TMP7:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index 1e029ba15904..55757157fce9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -24,12 +24,12 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; SSE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; SSE-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
-; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; SSE-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
-; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 4
-; SSE-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
+; SSE-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; SSE-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
+; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 4
+; SSE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; SSE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4
; SSE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; SSE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; SSE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -72,18 +72,18 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
-; AVX1-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
-; AVX1-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
-; AVX1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
-; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP15]], align 4
-; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 4
-; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP16]], align 4
-; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 8
-; AVX1-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP17]], align 4
-; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 12
-; AVX1-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4
+; AVX1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
+; AVX1-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
+; AVX1-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
+; AVX1-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
+; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 4
+; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8
+; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 12
+; AVX1-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP19]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP21]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP23]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP25]], align 4
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -126,18 +126,18 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; AVX2-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
-; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
-; AVX2-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
-; AVX2-NEXT: [[TMP14:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
-; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: store <8 x i32> [[TMP11]], ptr [[TMP15]], align 4
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 8
-; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP16]], align 4
-; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 16
-; AVX2-NEXT: store <8 x i32> [[TMP13]], ptr [[TMP17]], align 4
-; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 24
-; AVX2-NEXT: store <8 x i32> [[TMP14]], ptr [[TMP18]], align 4
+; AVX2-NEXT: [[TMP15:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
+; AVX2-NEXT: [[TMP16:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
+; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
+; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
+; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8
+; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 16
+; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 24
+; AVX2-NEXT: store <8 x i32> [[TMP15]], ptr [[TMP19]], align 4
+; AVX2-NEXT: store <8 x i32> [[TMP16]], ptr [[TMP21]], align 4
+; AVX2-NEXT: store <8 x i32> [[TMP17]], ptr [[TMP23]], align 4
+; AVX2-NEXT: store <8 x i32> [[TMP18]], ptr [[TMP25]], align 4
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index 5111e78bf8bc..8527fb433c63 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -36,23 +36,23 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 32
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 48
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 16
-; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 32
-; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 48
+; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2
+; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2
+; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2
; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP12]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -167,23 +167,23 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 2
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD]], <32 x i8> [[WIDE_LOAD]], <32 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32
-; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64
-; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP9]], align 2
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96
+; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2
+; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP9]], align 2
; CHECK-NEXT: store <32 x i8> [[TMP7]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index 995973a64523..292ab4e4b2c4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -36,12 +36,12 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b)
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 16
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 32
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 48
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope !0
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 09be042140fd..19ac52cf0d0b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -47,12 +47,12 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
@@ -207,12 +207,12 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
@@ -848,12 +848,12 @@ define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) {
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison)
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
@@ -1015,12 +1015,12 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
@@ -1423,12 +1423,12 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
@@ -1583,12 +1583,12 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
@@ -1743,12 +1743,12 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
@@ -1912,12 +1912,12 @@ define i32 @test_constant_max(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], <i1 true, i1 true, i1 true, i1 true>
@@ -2080,12 +2080,12 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
@@ -2241,12 +2241,12 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
@@ -2412,12 +2412,12 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) {
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 5cc4d43ec2e4..cf4327791628 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -101,12 +101,12 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8
-; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
-; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 24
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4
+; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4
+; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4
; AVX2-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
@@ -117,12 +117,12 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i32 8
-; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP18]], i32 16
-; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP18]], i32 24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison)
; AVX2-NEXT: [[TMP26:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
@@ -133,12 +133,12 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <8 x i1> [[TMP14]])
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i32 8
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP15]])
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP30]], i32 16
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <8 x i1> [[TMP16]])
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr [[TMP30]], i32 24
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <8 x i1> [[TMP16]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP29]], ptr [[TMP37]], i32 4, <8 x i1> [[TMP17]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
@@ -196,12 +196,12 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
-; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 32
-; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 48
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP13]], align 4
; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
@@ -212,12 +212,12 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison)
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i32 16
-; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison)
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP18]], i32 32
-; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison)
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP18]], i32 48
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison)
; AVX512-NEXT: [[TMP26:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP27:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
@@ -228,12 +228,12 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[TMP30]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <16 x i1> [[TMP14]])
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP30]], i32 16
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <16 x i1> [[TMP15]])
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[TMP30]], i32 32
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <16 x i1> [[TMP16]])
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr [[TMP30]], i32 48
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP26]], ptr [[TMP34]], i32 4, <16 x i1> [[TMP14]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP27]], ptr [[TMP35]], i32 4, <16 x i1> [[TMP15]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP28]], ptr [[TMP36]], i32 4, <16 x i1> [[TMP16]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP29]], ptr [[TMP37]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
@@ -400,12 +400,12 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP10]], align 4
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 8
-; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP11]], align 4
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 16
-; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP12]], align 4
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 24
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP10]], align 4
+; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP11]], align 4
+; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP12]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP13]], align 4
; AVX2-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
@@ -416,12 +416,12 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 8
-; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 16
-; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison)
; AVX2-NEXT: [[TMP26:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
@@ -432,12 +432,12 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <8 x i1> [[TMP14]])
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 8
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP15]])
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 16
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <8 x i1> [[TMP16]])
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 24
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <8 x i1> [[TMP16]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP29]], ptr addrspace(1) [[TMP37]], i32 4, <8 x i1> [[TMP17]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
@@ -495,12 +495,12 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP10]], align 4
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 16
-; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP11]], align 4
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 32
-; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP12]], align 4
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP6]], i32 48
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP10]], align 4
+; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP11]], align 4
+; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP12]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP13]], align 4
; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
@@ -511,12 +511,12 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison)
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 16
-; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison)
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 32
-; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison)
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP18]], i32 48
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison)
; AVX512-NEXT: [[TMP26:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP27:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
@@ -527,12 +527,12 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <16 x i1> [[TMP14]])
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 16
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <16 x i1> [[TMP15]])
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 32
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <16 x i1> [[TMP16]])
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP30]], i32 48
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP26]], ptr addrspace(1) [[TMP34]], i32 4, <16 x i1> [[TMP14]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP27]], ptr addrspace(1) [[TMP35]], i32 4, <16 x i1> [[TMP15]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP28]], ptr addrspace(1) [[TMP36]], i32 4, <16 x i1> [[TMP16]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP29]], ptr addrspace(1) [[TMP37]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
@@ -710,12 +710,12 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8
-; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
-; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 24
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4
+; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4
+; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4
; AVX2-NEXT: [[TMP14:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP15:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
@@ -726,12 +726,12 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x float> poison)
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[TMP18]], i32 8
-; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x float> poison)
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[TMP18]], i32 16
-; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison)
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP18]], i32 24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP22]], i32 4, <8 x i1> [[TMP14]], <8 x float> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP15]], <8 x float> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP24]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison)
; AVX2-NEXT: [[TMP26:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX2-NEXT: [[TMP27:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float>
@@ -746,12 +746,12 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]]
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP5]]
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr float, ptr [[TMP34]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP30]], ptr [[TMP38]], i32 4, <8 x i1> [[TMP14]])
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP34]], i32 8
-; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP31]], ptr [[TMP39]], i32 4, <8 x i1> [[TMP15]])
; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP34]], i32 16
-; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP32]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP16]])
; AVX2-NEXT: [[TMP41:%.*]] = getelementptr float, ptr [[TMP34]], i32 24
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP30]], ptr [[TMP38]], i32 4, <8 x i1> [[TMP14]])
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP31]], ptr [[TMP39]], i32 4, <8 x i1> [[TMP15]])
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP32]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP16]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP33]], ptr [[TMP41]], i32 4, <8 x i1> [[TMP17]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
@@ -810,12 +810,12 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 16
-; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 32
-; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 48
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP13]], align 4
; AVX512-NEXT: [[TMP14:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP15:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
@@ -826,12 +826,12 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, ptr [[TMP18]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x float> poison)
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[TMP18]], i32 16
-; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x float> poison)
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, ptr [[TMP18]], i32 32
-; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison)
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP18]], i32 48
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP22]], i32 4, <16 x i1> [[TMP14]], <16 x float> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP23]], i32 4, <16 x i1> [[TMP15]], <16 x float> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP24]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP25]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison)
; AVX512-NEXT: [[TMP26:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
; AVX512-NEXT: [[TMP27:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float>
@@ -846,12 +846,12 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP4]]
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP5]]
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr float, ptr [[TMP34]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP30]], ptr [[TMP38]], i32 4, <16 x i1> [[TMP14]])
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP34]], i32 16
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP31]], ptr [[TMP39]], i32 4, <16 x i1> [[TMP15]])
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP34]], i32 32
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP32]], ptr [[TMP40]], i32 4, <16 x i1> [[TMP16]])
; AVX512-NEXT: [[TMP41:%.*]] = getelementptr float, ptr [[TMP34]], i32 48
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP30]], ptr [[TMP38]], i32 4, <16 x i1> [[TMP14]])
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP31]], ptr [[TMP39]], i32 4, <16 x i1> [[TMP15]])
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP32]], ptr [[TMP40]], i32 4, <16 x i1> [[TMP16]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP33]], ptr [[TMP41]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
@@ -975,12 +975,12 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !8
; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !8
; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
-; AVX-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope !8
; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !8
+; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !8
+; AVX-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope !8
; AVX-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !8
; AVX-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100>
@@ -991,12 +991,12 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]]
; AVX-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]]
; AVX-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
-; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP12]], <4 x double> poison), !alias.scope !11
; AVX-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 4
-; AVX-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> poison), !alias.scope !11
; AVX-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 8
-; AVX-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP14]], <4 x double> poison), !alias.scope !11
; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 12
+; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP12]], <4 x double> poison), !alias.scope !11
+; AVX-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> poison), !alias.scope !11
+; AVX-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP14]], <4 x double> poison), !alias.scope !11
; AVX-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double> poison), !alias.scope !11
; AVX-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
; AVX-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double>
@@ -1011,12 +1011,12 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]]
; AVX-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0
-; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP28]], ptr [[TMP36]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !13, !noalias !15
; AVX-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 4
-; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP29]], ptr [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !13, !noalias !15
; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 8
-; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP30]], ptr [[TMP38]], i32 8, <4 x i1> [[TMP14]]), !alias.scope !13, !noalias !15
; AVX-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 12
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP28]], ptr [[TMP36]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !13, !noalias !15
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP29]], ptr [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !13, !noalias !15
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP30]], ptr [[TMP38]], i32 8, <4 x i1> [[TMP14]]), !alias.scope !13, !noalias !15
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP31]], ptr [[TMP39]], i32 8, <4 x i1> [[TMP15]]), !alias.scope !13, !noalias !15
; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
@@ -1075,12 +1075,12 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
-; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16
-; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 24
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4, !alias.scope !11
+; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !11
+; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP10]], align 4, !alias.scope !11
; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !11
; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
@@ -1091,12 +1091,12 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP12]], <8 x double> poison), !alias.scope !14
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP16]], i32 8
-; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> poison), !alias.scope !14
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP16]], i32 16
-; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP14]], <8 x double> poison), !alias.scope !14
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP16]], i32 24
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP12]], <8 x double> poison), !alias.scope !14
+; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> poison), !alias.scope !14
+; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP14]], <8 x double> poison), !alias.scope !14
; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double> poison), !alias.scope !14
; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x double>
@@ -1111,12 +1111,12 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP32]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP28]], ptr [[TMP36]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP32]], i32 8
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP29]], ptr [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP32]], i32 16
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP30]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP14]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP32]], i32 24
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP28]], ptr [[TMP36]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP29]], ptr [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP30]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP14]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP31]], ptr [[TMP39]], i32 8, <8 x i1> [[TMP15]]), !alias.scope !16, !noalias !18
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
@@ -1369,18 +1369,18 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -3
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !18
-; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -4
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 -3
-; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !18
-; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -8
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -3
-; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4, !alias.scope !18
-; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -12
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -3
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope !18
+; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope !18
+; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4, !alias.scope !18
+; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope !18
; AVX2-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
@@ -1391,24 +1391,24 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0
+; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -3
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -4
+; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -3
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8
+; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -3
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12
+; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3
; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -3
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope !21
; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -4
-; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -3
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope !21
; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8
-; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -3
; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope !21
; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12
-; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3
; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !21
; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[REVERSE13]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
@@ -1419,21 +1419,21 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
-; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x double> [[TMP32]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
; AVX2-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[TMP40]], i32 -3
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope !23, !noalias !25
-; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP36]], i32 -4
; AVX2-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[TMP42]], i32 -3
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope !23, !noalias !25
-; AVX2-NEXT: [[REVERSE27:%.*]] = shufflevector <4 x double> [[TMP34]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[TMP36]], i32 -8
; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -3
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope !23, !noalias !25
-; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP35]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -12
; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -3
+; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x double> [[TMP32]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope !23, !noalias !25
+; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope !23, !noalias !25
+; AVX2-NEXT: [[REVERSE27:%.*]] = shufflevector <4 x double> [[TMP34]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope !23, !noalias !25
+; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP35]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !23, !noalias !25
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
@@ -1493,18 +1493,18 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -7
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !31
-; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -8
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 -7
-; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !31
-; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -16
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 -7
-; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4, !alias.scope !31
-; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -24
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 -7
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope !31
+; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP11]], align 4, !alias.scope !31
+; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4, !alias.scope !31
+; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4, !alias.scope !31
; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
@@ -1515,24 +1515,24 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -7
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -7
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -16
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -7
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -24
+; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7
; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0
-; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -7
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP25]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope !34
; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8
-; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -7
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP27]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope !34
; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -16
-; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -7
; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope !34
; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -24
-; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7
; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34
; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[REVERSE13]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
@@ -1543,21 +1543,21 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP1]]
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP3]]
-; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[TMP40]], i32 -7
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope !36, !noalias !38
-; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP36]], i32 -8
; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[TMP42]], i32 -7
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope !36, !noalias !38
-; AVX512-NEXT: [[REVERSE27:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[TMP36]], i32 -16
; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[TMP44]], i32 -7
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope !36, !noalias !38
-; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP35]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[TMP36]], i32 -24
; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 -7
+; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE23]], ptr [[TMP41]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[TMP33]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE25]], ptr [[TMP43]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[REVERSE27:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE27]], ptr [[TMP45]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP35]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE29]], ptr [[TMP47]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
@@ -1646,12 +1646,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
-; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
-; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
; AVX1-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
@@ -1670,12 +1670,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
-; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
-; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
@@ -1694,12 +1694,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1758,12 +1758,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
-; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
-; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
@@ -1782,12 +1782,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
-; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
-; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
@@ -1806,12 +1806,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1870,12 +1870,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
-; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
-; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1
+; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1
+; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1
; AVX512-NEXT: [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -1894,12 +1894,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
-; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16
-; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
@@ -1918,12 +1918,12 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]])
; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]])
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]])
; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2027,12 +2027,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
-; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
-; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
; AVX1-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
@@ -2051,12 +2051,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
-; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
-; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
@@ -2075,12 +2075,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
; AVX1-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
; AVX1-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX1-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2139,12 +2139,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
-; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
-; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP13:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
@@ -2163,12 +2163,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP26:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 4
-; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
-; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 12
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP28]], i32 8, <4 x i1> [[TMP24]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP29]], i32 8, <4 x i1> [[TMP25]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP30]], i32 8, <4 x i1> [[TMP26]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP31]], i32 8, <4 x i1> [[TMP27]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP32:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP33:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
@@ -2187,12 +2187,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP46:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP42]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP47:%.*]] = select <4 x i1> [[TMP27]], <4 x i1> [[TMP43]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 4
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
; AVX2-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 12
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <4 x i1> [[TMP44]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <4 x i1> [[TMP45]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <4 x i1> [[TMP46]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <4 x i1> [[TMP47]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2251,12 +2251,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
-; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
-; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 24
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1
+; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1
+; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP10]], align 1
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP11]], align 1
; AVX512-NEXT: [[TMP12:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP13:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -2275,12 +2275,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP26:%.*]] = xor <8 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 8
-; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 16
-; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr ptr, ptr [[TMP20]], i32 24
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP28]], i32 8, <8 x i1> [[TMP24]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP29]], i32 8, <8 x i1> [[TMP25]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP30]], i32 8, <8 x i1> [[TMP26]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP31]], i32 8, <8 x i1> [[TMP27]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP32:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP33:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
@@ -2299,12 +2299,12 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP46:%.*]] = select <8 x i1> [[TMP26]], <8 x i1> [[TMP42]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP47:%.*]] = select <8 x i1> [[TMP27]], <8 x i1> [[TMP43]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP36]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]])
; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP36]], i32 8
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]])
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP36]], i32 16
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]])
; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, ptr [[TMP36]], i32 24
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP48]], i32 8, <8 x i1> [[TMP44]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP49]], i32 8, <8 x i1> [[TMP45]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP50]], i32 8, <8 x i1> [[TMP46]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP51]], i32 8, <8 x i1> [[TMP47]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
index 3a5db926082f..4472ae4fecd3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
@@ -30,20 +30,20 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT12]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 4
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 8
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope !0
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope !0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDEX]]
-; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 4
-; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 8
-; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 12
+; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope !3, !noalias !0
; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], ptr addrspace(1) [[TMP12]], align 8, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index 36dee12e4d7d..a49090eecac4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -76,8 +76,8 @@ define i32 @main(ptr %ptr) {
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[TMP21]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 4
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP25]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -98,7 +98,7 @@ define i32 @main(ptr %ptr) {
; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4
; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[DEC]] to i32
; CHECK-NEXT: [[CMP6:%.*]] = icmp ult i32 [[TMP0]], [[CONV5]]
-; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.cond4.for.inc9_crit_edge:
; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY8]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr @a, align 16
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index 4fa65af14270..97e1c1c4362f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -135,8 +135,8 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; SSE41-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
; SSE41-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]]
; SSE41-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
-; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4
; SSE41-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 4
+; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4
; SSE41-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP29]], align 4
; SSE41-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; SSE41-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -271,12 +271,12 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0
-; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4
; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4
-; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4
; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8
-; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4
; AVX1-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 12
+; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4
; AVX1-NEXT: store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index 6c2cb100311e..804bcb54b4ba 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -64,8 +64,8 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP7]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -88,7 +88,7 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) {
; CHECK-NEXT: [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]]
; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: loop.exit.loopexit:
; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP_EXIT]]
@@ -133,8 +133,8 @@ define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP7]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -202,8 +202,8 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP7]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -278,8 +278,8 @@ define float @PR35538(ptr nocapture readonly %a, i32 %N) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -363,8 +363,8 @@ define float @PR35538_more_FMF(ptr nocapture readonly %a, i32 %N) #0 {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
index 23945f9df0af..04b59475ca4b 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
@@ -21,10 +21,10 @@ define dso_local void @constTC(ptr noalias nocapture %A) optsize {
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x i32> <i32 13, i32 13>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2
-; CHECK-NEXT: store <2 x i32> <i32 13, i32 13>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4
+; CHECK-NEXT: store <2 x i32> <i32 13, i32 13>, ptr [[TMP6]], align 1
+; CHECK-NEXT: store <2 x i32> <i32 13, i32 13>, ptr [[TMP7]], align 1
; CHECK-NEXT: store <2 x i32> <i32 13, i32 13>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 6
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800
@@ -40,7 +40,7 @@ define dso_local void @constTC(ptr noalias nocapture %A) optsize {
; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 1800
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index d93cf2b14f8a..15e77f3a4847 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -19,11 +19,13 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.2> = phi ir<33>, vp<[[FOR1_SPLICE:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%for.1.next> = load ir<%gep.ptr>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN ir<%for.1.next> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: EMIT vp<[[FOR1_SPLICE]]> = first-order splice ir<%for.1>, ir<%for.1.next>
; CHECK-NEXT: EMIT vp<[[FOR2_SPLICE:%.+]]> = first-order splice ir<%for.2>, vp<[[FOR1_SPLICE]]>
; CHECK-NEXT: WIDEN ir<%add> = add vp<[[FOR1_SPLICE]]>, vp<[[FOR2_SPLICE]]>
-; CHECK-NEXT: WIDEN store ir<%gep.ptr>, ir<%add>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -71,13 +73,15 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.3> = phi ir<33>, vp<[[FOR2_SPLICE:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%for.1.next> = load ir<%gep.ptr>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN ir<%for.1.next> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: EMIT vp<[[FOR1_SPLICE]]> = first-order splice ir<%for.1>, ir<%for.1.next>
; CHECK-NEXT: EMIT vp<[[FOR2_SPLICE]]> = first-order splice ir<%for.2>, vp<[[FOR1_SPLICE]]>
; CHECK-NEXT: EMIT vp<[[FOR3_SPLICE:%.+]]> = first-order splice ir<%for.3>, vp<[[FOR2_SPLICE]]>
; CHECK-NEXT: WIDEN ir<%add.1> = add vp<[[FOR1_SPLICE]]>, vp<[[FOR2_SPLICE]]>
; CHECK-NEXT: WIDEN ir<%add.2> = add ir<%add.1>, vp<[[FOR3_SPLICE]]>
-; CHECK-NEXT: WIDEN store ir<%gep.ptr>, ir<%add.2>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add.2>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 506b66276121..b451d4b4e546 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -38,8 +38,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i32>, ptr [[TMP10]], align 4
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -48,8 +48,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP11]]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[TMP12]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP17]], align 4
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP17]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -246,8 +246,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2]] = load <4 x i32>, ptr [[TMP6]], align 4
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -507,8 +507,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP3]]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP8]], align 2
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -523,8 +523,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP3]]
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP17]], ptr [[TMP21]], align 8
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP17]], ptr [[TMP21]], align 8
; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP18]], ptr [[TMP22]], align 8
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1818,8 +1818,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP2]]
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP3]]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP7]], align 2
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -1832,8 +1832,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP19]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2041,8 +2041,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP6]], i64 1
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP7]], i64 1
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, ptr [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, ptr [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, ptr [[TMP19]], align 4
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP10]], align 2
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = load i16, ptr [[TMP11]], align 2
@@ -2071,8 +2071,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP42]], ptr [[TMP46]], align 4
; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP42]], ptr [[TMP46]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP43]], ptr [[TMP47]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2284,8 +2284,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP2]]
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP3]]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP7]], align 2
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -2300,8 +2300,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP20]], align 4
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP20]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP21]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2601,8 +2601,8 @@ define void @sink_dead_inst(ptr %a) {
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[A]], i16 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP12]], ptr [[TMP16]], align 2
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[TMP14]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP12]], ptr [[TMP16]], align 2
; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP13]], ptr [[TMP17]], align 2
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], <i16 4, i16 4, i16 4, i16 4>
@@ -3514,8 +3514,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP12]], align 4
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP12]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP13]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], <i16 4, i16 4, i16 4, i16 4>
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index f232ac3aa698..24c52b704952 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -102,8 +102,8 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N)
; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT5]]
; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4
; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4
+; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4
; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4
; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT5]]
@@ -349,8 +349,8 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32
; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub reassoc <4 x float> [[VEC_IND]], [[DOTSPLAT5]]
; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4
; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4
+; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4
; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4
; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <4 x float> [[STEP_ADD]], [[DOTSPLAT5]]
@@ -586,8 +586,8 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 {
; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4
; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4
+; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4
; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4
; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
@@ -859,8 +859,8 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
; VEC4_INTERL2-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VECTOR_BODY]] ]
; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]]
; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4
; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4
+; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4
; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4
; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]]
; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT]]
@@ -869,12 +869,12 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]]
; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[TMP9]]
; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: store <4 x float> [[TMP12]], ptr [[TMP14]], align 4
; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 4
+; VEC4_INTERL2-NEXT: store <4 x float> [[TMP12]], ptr [[TMP14]], align 4
; VEC4_INTERL2-NEXT: store <4 x float> [[TMP13]], ptr [[TMP15]], align 4
; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4
; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 4
+; VEC4_INTERL2-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4
; VEC4_INTERL2-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4
; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -4.000000e+00, float -4.000000e+00, float -4.000000e+00, float -4.000000e+00>
@@ -1167,8 +1167,8 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) {
; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4
; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4
+; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4
; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4
; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
@@ -1403,8 +1403,8 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) {
; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float
; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 4
; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4
+; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; VEC4_INTERL2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer
; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD3]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index a8cfac64258e..d73193392e39 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -112,8 +112,8 @@ define void @multi_int_induction(ptr %A, i32 %N) {
; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
@@ -161,8 +161,8 @@ define void @multi_int_induction(ptr %A, i32 %N) {
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP8]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
@@ -205,8 +205,8 @@ define void @multi_int_induction(ptr %A, i32 %N) {
; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 190, i32 191, i32 192, i32 193>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
@@ -410,13 +410,13 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP5]], i64 [[OFFSET]]
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 2
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; UNROLL-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP7]], align 4, !alias.scope [[META4]], !noalias [[META7]]
; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[OFFSET2]]
-; UNROLL-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope [[META7]]
; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 2
+; UNROLL-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope [[META7]]
; UNROLL-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP10]], align 4, !alias.scope [[META7]]
; UNROLL-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]]
; UNROLL-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]]
@@ -483,16 +483,16 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 2
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP12]], align 4, !alias.scope [[META4]], !noalias [[META7]]
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[OFFSET2]]
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], [[OFFSET2]]
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope [[META7]]
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 2
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope [[META7]]
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP18]], align 4, !alias.scope [[META7]]
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]]
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]]
@@ -553,13 +553,13 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP5]], i64 [[OFFSET]]
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; INTERLEAVE-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !alias.scope [[META4]], !noalias [[META7]]
; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[OFFSET2]]
-; INTERLEAVE-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope [[META7]]
; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 4
+; INTERLEAVE-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope [[META7]]
; INTERLEAVE-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !alias.scope [[META7]]
; INTERLEAVE-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]]
; INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]]
@@ -718,8 +718,8 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) {
; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8
; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; UNROLL-NEXT: [[TMP2]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; UNROLL-NEXT: [[TMP3]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -766,8 +766,8 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
; UNROLL-NO-IC-NEXT: [[TMP6]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; UNROLL-NO-IC-NEXT: [[TMP7]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -809,8 +809,8 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) {
; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 4
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8
; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; INTERLEAVE-NEXT: [[TMP2]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; INTERLEAVE-NEXT: [[TMP3]] = add <4 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -2117,8 +2117,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NEXT: [[TMP0:%.*]] = or disjoint i32 [[INDEX]], 2
; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL: pred.udiv.if:
@@ -2210,8 +2210,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
@@ -2305,8 +2305,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; INTERLEAVE-NEXT: [[TMP0:%.*]] = or disjoint i32 [[INDEX]], 4
; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; INTERLEAVE: pred.udiv.if:
@@ -3607,8 +3607,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
@@ -3683,8 +3683,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]]
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP13]]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP17]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
@@ -3751,8 +3751,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
@@ -3995,8 +3995,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 16, i32 16>
@@ -4074,8 +4074,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP14]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 8, i32 8>
@@ -4145,8 +4145,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]]
; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 32, i32 32, i32 32, i32 32>
@@ -4293,8 +4293,8 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; UNROLL-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
@@ -4334,8 +4334,8 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
@@ -4370,8 +4370,8 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; INTERLEAVE-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
@@ -4511,8 +4511,8 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
; UNROLL-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
@@ -4562,8 +4562,8 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]]
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP7]]
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP11]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
@@ -4603,8 +4603,8 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32
; INTERLEAVE-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
@@ -4748,8 +4748,8 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]]
; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64
; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
@@ -4795,8 +4795,8 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]]
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP6]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
@@ -4837,8 +4837,8 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]]
; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]]
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
@@ -4970,8 +4970,8 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
@@ -5016,8 +5016,8 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 4, i32 4>
@@ -5057,8 +5057,8 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 16, i32 16, i32 16, i32 16>
@@ -6050,8 +6050,8 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]]
; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP3]]
; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP4]]
-; UNROLL-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP6]], align 4
; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP6]], align 4
; UNROLL-NEXT: store <2 x i32> [[TMP8]], ptr [[TMP9]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
@@ -6091,8 +6091,8 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP6]]
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP7]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP12]], align 4
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP12]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP13]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
@@ -6145,8 +6145,8 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]]
; INTERLEAVE-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_IND]], [[TMP3]]
; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STEP_ADD]], [[TMP4]]
-; INTERLEAVE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4
; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
@@ -6396,8 +6396,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; UNROLL-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
; UNROLL-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
-; UNROLL-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP19]], align 4
; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 2
+; UNROLL-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP19]], align 4
; UNROLL-NEXT: store <2 x i32> [[TMP18]], ptr [[TMP20]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]]
@@ -6479,8 +6479,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP19]]
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP20]]
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], ptr [[TMP25]], align 4
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], ptr [[TMP25]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP22]], ptr [[TMP26]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]]
@@ -6554,8 +6554,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; INTERLEAVE-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; INTERLEAVE-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP19]], align 4
; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 4
+; INTERLEAVE-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP19]], align 4
; INTERLEAVE-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP20]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], [[DOTSPLAT3]]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 393490ad862b..5410d5d2f647 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -503,27 +503,27 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
-; CHECK-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
-; CHECK-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[DOTSPLATINSERT11:%.*]] = insertelement <4 x i8> poison, i8 [[INDUCTION_IV]], i64 0
-; CHECK-NEXT: [[DOTSPLAT12:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT11]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i8> <i8 0, i8 1, i8 2, i8 3>, [[DOTSPLAT12]]
-; CHECK-NEXT: [[INDUCTION13:%.*]] = add <4 x i8> [[DOTSPLAT10]], [[TMP8]]
+; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[INDUCTION_IV]], i64 0
+; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i8> <i8 0, i8 1, i8 2, i8 3>, [[DOTSPLAT11]]
+; CHECK-NEXT: [[INDUCTION12:%.*]] = add <4 x i8> [[DOTSPLAT9]], [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = mul i8 [[INDUCTION_IV]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0
-; CHECK-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT14]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0
+; CHECK-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT13]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND16:%.*]] = phi <4 x i8> [ [[INDUCTION13]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = add i64 1, [[INDEX8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX18]], 0
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i8> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX17:%.*]] = add i64 1, [[INDEX7]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX17]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <4 x i8> [[VEC_IND16]], ptr [[TMP12]], align 1
-; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX8]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT17]] = add <4 x i8> [[VEC_IND16]], [[DOTSPLAT15]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 84
+; CHECK-NEXT: store <4 x i8> [[VEC_IND15]], ptr [[TMP12]], align 1
+; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX7]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i8> [[VEC_IND15]], [[DOTSPLAT14]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT18]], 84
; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br i1 true, label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -593,27 +593,27 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT9]], <2 x i8> poison, <2 x i32> zeroinitializer
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT11:%.*]] = insertelement <2 x i8> poison, i8 [[INDUCTION_IV]], i64 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT12:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT11]], <2 x i8> poison, <2 x i32> zeroinitializer
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = mul <2 x i8> <i8 0, i8 1>, [[DOTSPLAT12]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION13:%.*]] = add <2 x i8> [[DOTSPLAT10]], [[TMP8]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <2 x i8> poison, i8 [[INDUCTION_IV]], i64 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT10]], <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = mul <2 x i8> <i8 0, i8 1>, [[DOTSPLAT11]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION12:%.*]] = add <2 x i8> [[DOTSPLAT9]], [[TMP8]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = mul i8 [[INDUCTION_IV]], 2
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i64 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT14]], <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i64 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT13]], <2 x i8> poison, <2 x i32> zeroinitializer
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.vector.body:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND16:%.*]] = phi <2 x i8> [ [[INDUCTION13]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX18:%.*]] = add i64 1, [[INDEX8]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX18]], 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND15:%.*]] = phi <2 x i8> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX17:%.*]] = add i64 1, [[INDEX7]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX17]], 0
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[VEC_IND16]], ptr [[TMP12]], align 1
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX8]], 2
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT17]] = add <2 x i8> [[VEC_IND16]], [[DOTSPLAT15]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 84
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[VEC_IND15]], ptr [[TMP12]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX7]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT16]] = add <2 x i8> [[VEC_IND15]], [[DOTSPLAT14]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT18]], 84
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.middle.block:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
index 84d72892193e..e4c183906865 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -16,12 +16,12 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 8
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
index 1cce3c837c08..0b98a054ebea 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
@@ -14,10 +14,10 @@ define i32 @reduction_sum(i64 %n, ptr noalias nocapture %A) {
; UF3-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV1]]
; UF3-NEXT: [[GEP2:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV2]]
; UF3-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 0
-; UF3-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4
; UF3-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 4
-; UF3-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4
; UF3-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 8
+; UF3-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4
+; UF3-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4
; UF3-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4
; UF3-NEXT: [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]]
; UF3-NEXT: [[SUM1_NEXT]] = add <4 x i32> [[SUM1]], [[L1]]
@@ -50,14 +50,14 @@ define i32 @reduction_sum(i64 %n, ptr noalias nocapture %A) {
; UF5-NEXT: [[GEP3:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV3]]
; UF5-NEXT: [[GEP4:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV4]]
; UF5-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 0
-; UF5-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4
; UF5-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 4
-; UF5-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4
; UF5-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 8
-; UF5-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4
; UF5-NEXT: [[L_GEP3:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 12
-; UF5-NEXT: [[L3:%.+]] = load <4 x i32>, ptr [[L_GEP3]], align 4
; UF5-NEXT: [[L_GEP4:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 16
+; UF5-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4
+; UF5-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4
+; UF5-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4
+; UF5-NEXT: [[L3:%.+]] = load <4 x i32>, ptr [[L_GEP3]], align 4
; UF5-NEXT: [[L4:%.+]] = load <4 x i32>, ptr [[L_GEP4]], align 4
; UF5-NEXT: [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]]
; UF5-NEXT: [[SUM1_NEXT]] = add <4 x i32> [[SUM1]], [[L1]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 2d5db6fc2833..1b9f15a419ea 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -32,18 +32,18 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_LOAD2]], [[STEP_ADD]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP13]], ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[TMP16]], 1
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP13]], ptr [[TMP15]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP14]], ptr [[TMP18]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[STEP_ADD]], [[DOTSPLAT]]
@@ -113,16 +113,16 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 [[TMP9]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 1 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = add nsw <vscale x 1 x i64> [[WIDE_LOAD]], [[VEC_IND]]
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 1 x i64> [[WIDE_LOAD2]], [[STEP_ADD]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 1 x i64> [[TMP11]], ptr [[TMP13]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP14]]
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP11]], ptr [[TMP13]], align 8
; CHECK-NEXT: store <vscale x 1 x i64> [[TMP12]], ptr [[TMP15]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[STEP_ADD]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
index 33248e13320b..b50a7cd999ec 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -47,18 +47,18 @@
; CHECKUF2: vector.body:
; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, ptr %b, i64 %index
-; CHECKUF2: %wide.load = load <vscale x 4 x double>, ptr %[[IDXB]], align 8
; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, ptr %[[IDXB]], i64 %[[VSCALE2]]
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, ptr %[[IDXB]], align 8
; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, ptr %[[IDXB_NEXT]], align 8
; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i64 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i64 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, ptr %a, i64 %index
-; CHECKUF2: store <vscale x 4 x double> %[[FADD]], ptr %[[IDXA]], align 8
; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, ptr %[[IDXA]], i64 %[[VSCALE2]]
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], ptr %[[IDXA]], align 8
; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], ptr %[[IDXA_NEXT]], align 8
; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]
; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
diff --git a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
index ed0d35da387c..4a6523f2dabf 100644
--- a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
@@ -11,8 +11,8 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; CHECK: %[[T2:.+]] = add nuw nsw i64 %offset.idx, %tmp0
; CHECK: %[[T3:.+]] = sub nsw i64 %[[T2]], %x
; CHECK: %[[T4:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T3]]
-; CHECK: load <4 x i32>, ptr %[[T4]], align 4
; CHECK: %[[T6:.+]] = getelementptr inbounds i32, ptr %[[T4]], i64 4
+; CHECK: load <4 x i32>, ptr %[[T4]], align 4
; CHECK: load <4 x i32>, ptr %[[T6]], align 4
; CHECK: br {{.*}}, label %middle.block, label %vector.body
;
@@ -30,8 +30,8 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; NO-IC: %[[T8:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T6]]
; NO-IC: %[[T9:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T7]]
; NO-IC: %[[T10:.+]] = getelementptr inbounds i32, ptr %[[T8]], i32 0
-; NO-IC: load <4 x i32>, ptr %[[T10]], align 4
; NO-IC: %[[T12:.+]] = getelementptr inbounds i32, ptr %[[T8]], i32 4
+; NO-IC: load <4 x i32>, ptr %[[T10]], align 4
; NO-IC: load <4 x i32>, ptr %[[T12]], align 4
; NO-IC: br {{.*}}, label %middle.block, label %vector.body
;
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
index 305fb7f8ab8d..12b8e657aabf 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
@@ -29,10 +29,12 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" +
; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" +
+; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" +
; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>) (using vector intrinsic)\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" +
+; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" +
+; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" +
; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" +
; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" +
; CHECK-NEXT: "No successors\l"
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 8d50993faf9e..7ab2459ada2e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -17,7 +17,8 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next.p, ir<1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%dst>, vp<[[STEPS:%.+]]>
-; CHECK-NEXT: WIDEN store ir<%gep>, ir<%iv>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%iv>
; CHECK-NEXT: EMIT vp<[[CAN_INC:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 13f444eaaf76..52a18d7b365c 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -21,10 +21,12 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>)
; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%call>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -70,12 +72,14 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0, ir<1>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%y>, ir<%iv>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%cmp> = icmp eq ir<%arrayidx>, ir<%z>
; CHECK-NEXT: WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01>
; CHECK-NEXT: WIDEN ir<%add> = fadd ir<%lv>, ir<%sel>
; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%add>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -123,7 +127,8 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>)
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -170,7 +175,8 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[IV]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sank outside of loop)
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -241,7 +247,8 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%cmp>
; CHECK-NEXT: BLEND ir<%d> = ir<0>/vp<[[NOT]]> vp<[[PRED]]>/ir<%cmp>
; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%idx>, ir<%d>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -364,9 +371,11 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi ir<0.000000e+00>, ir<%muladd>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%l.a> = load ir<%arrayidx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%l.a> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%l.b> = load ir<%arrayidx2>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN ir<%l.b> = load vp<[[VEC_PTR2]]>
; CHECK-NEXT: EMIT vp<[[FMUL:%.+]]> = fmul nnan ninf nsz ir<%l.a>, ir<%l.b>
; CHECK-NEXT: REDUCE ir<[[MULADD:%.+]]> = ir<%sum.07> + nnan ninf nsz reduce.fadd (vp<[[FMUL]]>)
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -415,7 +424,8 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%asd>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lsd> = load ir<%isd>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%isd>
+; CHECK-NEXT: WIDEN ir<%lsd> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%psd> = add nuw nsw ir<%lsd>, ir<23>
; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%lsd>, ir<100>
; CHECK-NEXT: WIDEN ir<%cmp2> = icmp sge ir<%lsd>, ir<200>
@@ -443,7 +453,8 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp2>
; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = select vp<[[NOT1]]>, vp<[[NOT2]]>, ir<false>
; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]>/vp<[[OR1]]> ir<%psd>/vp<[[SEL2]]>
-; CHECK-NEXT: WIDEN store ir<%isd>, ir<%ysd.0>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%isd>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -560,7 +571,8 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off>
-; CHECK-NEXT: WIDEN store ir<%gep>, ir<0>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -605,12 +617,14 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep.y> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%gep.y>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.y>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%add> = fadd nnan ir<%lv>, ir<1.000000e+00>
; CHECK-NEXT: WIDEN ir<%mul> = fmul reassoc nnan ninf nsz arcp contract afn ir<%add>, ir<2.000000e+00>
; CHECK-NEXT: WIDEN ir<%div> = fdiv reassoc nsz contract ir<%mul>, ir<2.000000e+00>
; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%gep.x>, ir<%div>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%div>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -656,11 +670,13 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%gep.x>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%div.1> = udiv exact ir<%lv>, ir<20>
; CHECK-NEXT: WIDEN ir<%div.2> = udiv ir<%lv>, ir<60>
; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%div.1>, ir<%div.2>
-; CHECK-NEXT: WIDEN store ir<%gep.x>, ir<%add>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -705,7 +721,8 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%ld.value> = load ir<%ld.addr>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%ld.addr>
+; CHECK-NEXT: WIDEN ir<%ld.value> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%ifcond> = fcmp oeq ir<%ld.value>, ir<5.000000e+00>
; CHECK-NEXT: Successor(s): pred.call
; CHECK-EMPTY:
@@ -731,7 +748,8 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-NEXT: EMIT vp<[[NOT_COND:%.+]]> = not ir<%ifcond>
; CHECK-NEXT: BLEND ir<%st.value> = ir<%ld.value>/vp<[[NOT_COND]]> ir<%fadd>/ir<%ifcond>
; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%st.addr>, ir<%st.value>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%st.addr>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -786,11 +804,13 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%gep.x>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%or.1> = or disjoint ir<%lv>, ir<1>
; CHECK-NEXT: WIDEN ir<%or.2> = or ir<%lv>, ir<3>
; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%or.1>, ir<%or.2>
-; CHECK-NEXT: WIDEN store ir<%gep.x>, ir<%add>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%add>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -835,7 +855,8 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) {
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%l> = load ir<%idx>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l>
; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1>
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index f2e6dd43a2ec..229d5b139d97 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -974,7 +974,8 @@ define void @sinking_requires_duplication(ptr %addr) {
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%addr>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%0> = load ir<%gep>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%pred> = fcmp oeq ir<%0>, ir<0.000000e+00>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = not ir<%pred>
; CHECK-NEXT: Successor(s): pred.store
@@ -1114,7 +1115,8 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION ir<%start>, -1
; CHECK-NEXT: CLONE ir<%ptr.iv.next> = getelementptr inbounds ir<%ptr.iv>, ir<-1>
-; CHECK-NEXT: WIDEN ir<%l> = load ir<%ptr.iv.next>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer (reverse) ir<%ptr.iv.next>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<0>
; CHECK-NEXT: EMIT vp<[[NEG:%.+]]> = not ir<%c.1>
; CHECK-NEXT: Successor(s): pred.store
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll b/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
index c56094083527..3e1f8b97e98b 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 4
; RUN: opt < %s -passes='lowertypetests,default<O3>' -S | FileCheck %s
; This IR is based of the following C++
@@ -156,38 +156,38 @@ attributes #8 = { noreturn nounwind }
!13 = !{}
!14 = !{!"branch_weights", i32 1048575, i32 1}
; CHECK: Function Attrs: minsize mustprogress optsize
-; CHECK-LABEL: define dso_local void @_Z7throw_ei
-; CHECK-SAME: (i32 noundef [[NUM:%.*]]) #[[ATTR0:[0-9]+]] !type !4 !type !5 !type !6 {
+; CHECK-LABEL: define dso_local void @_Z7throw_ei(
+; CHECK-SAME: i32 noundef [[NUM:%.*]]) #[[ATTR0:[0-9]+]] !type [[META4:![0-9]+]] !type [[META5:![0-9]+]] !type [[META6:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[NUM]], 0
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
; CHECK: if.then:
-; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: [[EXCEPTION:%.*]] = tail call ptr @__cxa_allocate_exception(i64 4) #[[ATTR6:[0-9]+]]
; CHECK-NEXT: store i32 20, ptr [[EXCEPTION]], align 16, !tbaa [[TBAA7:![0-9]+]]
-; CHECK-NEXT: tail call void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @_ZTIi, ptr null) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: tail call void @__cxa_throw(ptr nonnull [[EXCEPTION]], ptr nonnull @_ZTIi, ptr null) #[[ATTR7:[0-9]+]]
; CHECK-NEXT: unreachable
; CHECK: if.end:
; CHECK-NEXT: ret void
;
;
; CHECK: Function Attrs: minsize mustprogress optsize
-; CHECK-LABEL: define dso_local void @_Z10call_catchi
-; CHECK-SAME: (i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] personality ptr @__gxx_personality_v0 !type !4 !type !5 !type !6 {
+; CHECK-LABEL: define dso_local void @_Z10call_catchi(
+; CHECK-SAME: i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] personality ptr @__gxx_personality_v0 !type [[META4]] !type [[META5]] !type [[META6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: store ptr @_Z7throw_ei.cfi_jt, ptr @catch_ptr, align 8, !tbaa [[TBAA11:![0-9]+]]
-; CHECK-NEXT: invoke void @_Z7throw_ei.cfi_jt() #[[ATTR7:[0-9]+]]
-; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]]
+; CHECK-NEXT: invoke void @_Z7throw_ei.cfi_jt(i32 noundef [[NUM]]) #[[ATTR8:[0-9]+]]
+; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]], !callees [[META13:![0-9]+]]
; CHECK: lpad:
; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT: catch ptr @_ZTIi
+; CHECK-NEXT: catch ptr @_ZTIi
; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTIi) #[[ATTR5]]
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTIi) #[[ATTR6]]
; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
; CHECK: catch:
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: tail call void @__cxa_end_catch() #[[ATTR5]]
+; CHECK-NEXT: [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) #[[ATTR6]]
+; CHECK-NEXT: tail call void @__cxa_end_catch() #[[ATTR6]]
; CHECK-NEXT: br label [[TRY_CONT]]
; CHECK: try.cont:
; CHECK-NEXT: ret void
@@ -196,33 +196,46 @@ attributes #8 = { noreturn nounwind }
;
;
; CHECK: Function Attrs: minsize optsize
-; CHECK-LABEL: define weak_odr hidden void @__cfi_check_fail
-; CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-LABEL: define weak_odr hidden void @__cfi_check_fail(
+; CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq ptr [[TMP0]], null, !nosanitize !13
-; CHECK-NEXT: br i1 [[DOTNOT]], label [[TRAP:%.*]], label [[CONT:%.*]], !nosanitize !13
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq ptr [[TMP0]], null, !nosanitize [[META14:![0-9]+]]
+; CHECK-NEXT: br i1 [[DOTNOT]], label [[TRAP:%.*]], label [[CONT:%.*]], !nosanitize [[META14]]
; CHECK: trap:
-; CHECK-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR8:[0-9]+]], !nosanitize !13
-; CHECK-NEXT: unreachable, !nosanitize !13
+; CHECK-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR9:[0-9]+]], !nosanitize [[META14]]
+; CHECK-NEXT: unreachable, !nosanitize [[META14]]
; CHECK: cont:
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 4, !nosanitize !13
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 4, !nosanitize [[META14]]
; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[TMP2]], 5
; CHECK-NEXT: br i1 [[SWITCH]], label [[TRAP]], label [[CONT6:%.*]]
; CHECK: cont6:
-; CHECK-NEXT: ret void, !nosanitize !13
+; CHECK-NEXT: ret void, !nosanitize [[META14]]
;
;
-; CHECK-LABEL: define weak void @__cfi_check
-; CHECK-SAME: (i64 [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]]) local_unnamed_addr {
+; CHECK-LABEL: define weak void @__cfi_check(
+; CHECK-SAME: i64 [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]]) local_unnamed_addr {
; CHECK-NEXT: entry:
; CHECK-NEXT: tail call void @llvm.trap()
; CHECK-NEXT: unreachable
;
;
; CHECK: Function Attrs: naked nocf_check noinline
-; CHECK-LABEL: define internal void @_Z7throw_ei.cfi_jt
-; CHECK-SAME: () #[[ATTR4:[0-9]+]] align 8 {
+; CHECK-LABEL: define internal void @_Z7throw_ei.cfi_jt(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] align 8 {
; CHECK-NEXT: entry:
-; CHECK-NEXT: tail call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr nonnull @_Z7throw_ei) #[[ATTR5]]
+; CHECK-NEXT: tail call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr nonnull @_Z7throw_ei) #[[ATTR6]]
; CHECK-NEXT: unreachable
;
+;.
+; CHECK: [[META4]] = !{i64 0, !"_ZTSFviE"}
+; CHECK: [[META5]] = !{i64 0, !"_ZTSFviE.generalized"}
+; CHECK: [[META6]] = !{i64 0, i64 -8738933900360652027}
+; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+; CHECK: [[META8]] = !{!"int", [[META9:![0-9]+]], i64 0}
+; CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0}
+; CHECK: [[META10]] = !{!"Simple C++ TBAA"}
+; CHECK: [[TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0}
+; CHECK: [[META12]] = !{!"any pointer", [[META9]], i64 0}
+; CHECK: [[META13]] = !{ptr @_Z7throw_ei.cfi_jt}
+; CHECK: [[META14]] = !{}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll b/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll
index b24effed7024..d2f4696ccf41 100644
--- a/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll
+++ b/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll
@@ -9,9 +9,6 @@
; The raw profiles store compressed function names, so the profile reader should
; be built with zlib support to decompress them.
; REQUIRES: zlib
-; REQUIRES: host-byteorder-little-endian
-; Raw profiles are generate on 64-bit systems.
-; REQUIRES: llvm-64-bits
; RUN: rm -rf %t && split-file %s %t && cd %t
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll
index eb813bdb8c4e..ad4d4cf28ace 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/constraint-elimination-placement.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-macosx"
define i1 @test_order_1(ptr %this, ptr noalias %other, i1 %tobool9.not, i32 %call) {
-; CHECK-LABEL: define i1 @test_order_1(
+; CHECK-LABEL: define noundef i1 @test_order_1(
; CHECK-SAME: ptr nocapture writeonly [[THIS:%.*]], ptr noalias [[OTHER:%.*]], i1 [[TOBOOL9_NOT:%.*]], i32 [[CALL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[TOBOOL9_NOT]], label [[EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]]
@@ -105,14 +105,14 @@ define void @test2(ptr %this) #0 {
; CHECK-NEXT: i64 17, label [[IF_END_I31:%.*]]
; CHECK-NEXT: ]
; CHECK: if.end.i:
-; CHECK-NEXT: [[CALL8_I_I:%.*]] = tail call fastcc i32 @test2_fn6()
+; CHECK-NEXT: [[CALL8_I_I:%.*]] = tail call fastcc noundef i32 @test2_fn6()
; CHECK-NEXT: [[TRUNC_I_I:%.*]] = trunc i32 [[CALL8_I_I]] to i8
; CHECK-NEXT: [[CALL1_I1_I:%.*]] = tail call i1 @test2_fn4(i8 [[TRUNC_I_I]])
; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[CALL1_I1_I]], true
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP0]])
; CHECK-NEXT: br label [[COMMON_RET]]
; CHECK: test2_fn2.exit12:
-; CHECK-NEXT: [[CALL8_I_I8:%.*]] = tail call fastcc i32 @test2_fn6()
+; CHECK-NEXT: [[CALL8_I_I8:%.*]] = tail call fastcc noundef i32 @test2_fn6()
; CHECK-NEXT: [[TRUNC_I_I9:%.*]] = trunc i32 [[CALL8_I_I8]] to i8
; CHECK-NEXT: [[CALL1_I1_I10:%.*]] = tail call i1 @test2_fn4(i8 [[TRUNC_I_I9]])
; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[CALL1_I1_I10]], true
@@ -125,11 +125,11 @@ define void @test2(ptr %this) #0 {
; CHECK-NEXT: store i8 0, ptr [[THIS]], align 4
; CHECK-NEXT: br label [[COMMON_RET]]
; CHECK: if.end.i31:
-; CHECK-NEXT: [[DOTPRE:%.*]] = tail call fastcc i32 @test2_fn6()
-; CHECK-NEXT: [[DOTPRE38:%.*]] = trunc i32 [[DOTPRE]] to i8
-; CHECK-NEXT: [[DOTPRE39:%.*]] = tail call i1 @test2_fn4(i8 [[DOTPRE38]])
-; CHECK-NEXT: [[DOTPRE40:%.*]] = xor i1 [[DOTPRE39]], true
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[DOTPRE40]])
+; CHECK-NEXT: [[CALL8_I_I32:%.*]] = tail call fastcc noundef i32 @test2_fn6()
+; CHECK-NEXT: [[TRUNC_I_I33:%.*]] = trunc i32 [[CALL8_I_I32]] to i8
+; CHECK-NEXT: [[CALL1_I1_I34:%.*]] = tail call i1 @test2_fn4(i8 [[TRUNC_I_I33]])
+; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[CALL1_I1_I34]], true
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]])
; CHECK-NEXT: br label [[COMMON_RET]]
;
entry:
@@ -152,7 +152,7 @@ if.else21: ; preds = %entry
}
define i1 @test2_fn2(ptr %__rhs) #0 {
-; CHECK-LABEL: define i1 @test2_fn2(
+; CHECK-LABEL: define noundef i1 @test2_fn2(
; CHECK-SAME: ptr nocapture readonly [[__RHS:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[__RHS]])
@@ -162,7 +162,7 @@ define i1 @test2_fn2(ptr %__rhs) #0 {
; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[CALL]], [[COND_I]]
; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[IF_END:%.*]], label [[CLEANUP:%.*]]
; CHECK: if.end:
-; CHECK-NEXT: [[CALL8_I:%.*]] = tail call fastcc i32 @test2_fn6()
+; CHECK-NEXT: [[CALL8_I:%.*]] = tail call fastcc noundef i32 @test2_fn6()
; CHECK-NEXT: [[TRUNC_I:%.*]] = trunc i32 [[CALL8_I]] to i8
; CHECK-NEXT: [[CALL1_I1:%.*]] = tail call i1 @test2_fn4(i8 [[TRUNC_I]])
; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[CALL1_I1]], true
@@ -231,7 +231,7 @@ entry:
}
define internal i32 @test2_fn6() {
-; CHECK-LABEL: define internal fastcc i32 @test2_fn6(
+; CHECK-LABEL: define internal fastcc noundef i32 @test2_fn6(
; CHECK-SAME: ) unnamed_addr #[[ATTR5]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 40a5c73b69de..b43fcc8548ce 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -49,8 +49,8 @@ define void @loop(ptr %X, ptr %Y) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD8]], zeroinitializer
@@ -61,8 +61,8 @@ define void @loop(ptr %X, ptr %Y) {
; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x double> zeroinitializer, <2 x double> [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i64 2
+; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP11]], align 8
; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[TMP12]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000
@@ -149,20 +149,20 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope !4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope !4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !alias.scope !4
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 20, i32 20, i32 20, i32 20>
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD7]], <i32 20, i32 20, i32 20, i32 20>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !alias.scope !7
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !alias.scope !7
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !alias.scope !7
; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD9]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope !9, !noalias !11
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope !9, !noalias !11
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope !9, !noalias !11
; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[WIDE_LOAD10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = fadd <4 x float> [[TMP6]], [[TMP10]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
index e43d25eb1d0e..e346bef27bbd 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
@@ -50,12 +50,12 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) {
; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[WIDE_LOAD17]], [[VEC_PHI16]]
@@ -171,16 +171,16 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) {
; CHECK-NEXT: [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD29]], [[VEC_PHI28]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 4ff67b560161..8f37e79539b8 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -180,18 +180,18 @@ define void @test_runtime_trip_count(i32 %N) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 16
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP1]], align 16
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP2]], align 16
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP2]], align 16
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD5]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD4]], [[WIDE_LOAD6]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 16
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 2
+; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 16
; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 16
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll
index 5dac5edf6365..eb8fe1027452 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll
@@ -56,8 +56,8 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8
; O2: vector.body:
; O2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_BODY4_PREHEADER]] ]
; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[INDEX]]
-; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
; O2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
; O2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]]
; O2-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; O2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD8]], <i32 1, i32 1, i32 1, i32 1>
@@ -104,8 +104,8 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8
; O3: vector.body:
; O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ]
; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[INDEX]]
-; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
; O3-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
; O3-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]]
; O3-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; O3-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], <i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
index 0e751ce4e55a..8f1c52c59163 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll
@@ -90,7 +90,7 @@ bb3: ; preds = %bb1, %bb2
define i1 @test2(i32 %c) {
; CHECK-LABEL: @test2(
-; CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @test1(i32 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = tail call noundef i1 @test1(i32 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]]
; CHECK-NEXT: ret i1 [[TMP2]]
;
entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
index cee2666bc506..c50f16404630 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
@@ -33,8 +33,8 @@ define void @loop_or(ptr noalias %pIn, ptr noalias %pOut, i32 %s) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[PIN:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i32>
@@ -43,8 +43,8 @@ define void @loop_or(ptr noalias %pIn, ptr noalias %pOut, i32 %s) {
; CHECK-NEXT: [[TMP6:%.*]] = or disjoint <4 x i32> [[TMP4]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <4 x i32> [[TMP5]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[POUT:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 4
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll
index f1ddee00ca3d..24cc18625af8 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll
@@ -40,8 +40,8 @@ define void @licm(ptr align 8 dereferenceable(8) %_M_start.i, i64 %numElem) {
; O23: vector.body:
; O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 [[INDEX]]
-; O23-NEXT: store <2 x double> <double 2.000000e+00, double 2.000000e+00>, ptr [[TMP1]], align 8, !tbaa [[TBAA8:![0-9]+]]
; O23-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 2
+; O23-NEXT: store <2 x double> <double 2.000000e+00, double 2.000000e+00>, ptr [[TMP1]], align 8, !tbaa [[TBAA8:![0-9]+]]
; O23-NEXT: store <2 x double> <double 2.000000e+00, double 2.000000e+00>, ptr [[TMP2]], align 8, !tbaa [[TBAA8]]
; O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; O23-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index 2510aeaebcf0..5053fd64f59c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -37,24 +37,24 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 4
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP6]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 8
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP7]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP6]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP7]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x double>, ptr [[TMP8]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[WIDE_LOAD6]], [[TMP2]]
; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP3]]
; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP4]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: store <4 x double> [[TMP9]], ptr [[TMP13]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
-; CHECK-NEXT: store <4 x double> [[TMP10]], ptr [[TMP14]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
-; CHECK-NEXT: store <4 x double> [[TMP11]], ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
+; CHECK-NEXT: store <4 x double> [[TMP9]], ptr [[TMP13]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store <4 x double> [[TMP10]], ptr [[TMP14]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store <4 x double> [[TMP11]], ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: store <4 x double> [[TMP12]], ptr [[TMP16]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll
index 1c493144fcd1..e9a42a859ab3 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll
@@ -17,8 +17,8 @@ define i16 @test(ptr %ptr) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i16> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i16>
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i16>
diff --git a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll
index 678ac59a694b..4c9cd3090681 100644
--- a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll
+++ b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll
@@ -11,7 +11,7 @@ entry:
}
define ptr @parent(ptr align 8 dereferenceable(72) %f, half %val1, i16 %val2, i32 %val3) align 2 {
-; CHECK-LABEL: define nonnull ptr @parent
+; CHECK-LABEL: define noundef nonnull ptr @parent
; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[F]], i64 64
diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll
index ee913d299aaa..a3d111fffbbf 100644
--- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll
+++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll
@@ -13,7 +13,7 @@ entry:
}
define ptr @parent(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) align 2 {
-; CHECK-LABEL: define nonnull ptr @parent
+; CHECK-LABEL: define noundef nonnull ptr @parent
; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[F]], i64 64
diff --git a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
index a42e4dd75d9f..ba4583160b13 100644
--- a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
+++ b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
@@ -2,7 +2,7 @@
; RUN: opt -S -O3 -memssa-check-limit=1 -memdep-block-scan-limit=1 < %s | FileCheck %s
define i32 @f(ptr noalias %p, i32 %c) {
-; CHECK-LABEL: define i32 @f
+; CHECK-LABEL: define noundef i32 @f
; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr {
; CHECK-NEXT: tail call void @g()
; CHECK-NEXT: tail call void @g()
diff --git a/llvm/test/Transforms/PhaseOrdering/gep-null-compare-in-loop.ll b/llvm/test/Transforms/PhaseOrdering/gep-null-compare-in-loop.ll
index 545e203c5e2c..fba3c1c154d8 100644
--- a/llvm/test/Transforms/PhaseOrdering/gep-null-compare-in-loop.ll
+++ b/llvm/test/Transforms/PhaseOrdering/gep-null-compare-in-loop.ll
@@ -33,7 +33,7 @@ bb12:
}
define i32 @using_alloca() {
-; CHECK-LABEL: define i32 @using_alloca
+; CHECK-LABEL: define noundef i32 @using_alloca
; CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: bb:
; CHECK-NEXT: ret i32 6
@@ -51,7 +51,7 @@ bb:
}
define i32 @using_malloc() {
-; CHECK-LABEL: define i32 @using_malloc
+; CHECK-LABEL: define noundef i32 @using_malloc
; CHECK-SAME: () local_unnamed_addr #[[ATTR0]] {
; CHECK-NEXT: bb:
; CHECK-NEXT: ret i32 6
diff --git a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll
index c159d1b68678..89095048f224 100644
--- a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll
+++ b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll
@@ -56,3 +56,64 @@ loop.latch:
exit:
ret void
}
+
+
+define void @chained_conditions(i64 noundef %a, i64 noundef %b, i64 noundef %c, i64 noundef %d) #0 {
+; CHECK-LABEL: @chained_conditions(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ %a.addr = alloca i64, align 8
+ %b.addr = alloca i64, align 8
+ %c.addr = alloca i64, align 8
+ %d.addr = alloca i64, align 8
+ store i64 %a, ptr %a.addr, align 8
+ store i64 %b, ptr %b.addr, align 8
+ store i64 %c, ptr %c.addr, align 8
+ store i64 %d, ptr %d.addr, align 8
+ %0 = load i64, ptr %a.addr, align 8
+ %cmp = icmp ugt i64 %0, 2048
+ br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ %1 = load i64, ptr %b.addr, align 8
+ %cmp1 = icmp ugt i64 %1, 1024
+ br i1 %cmp1, label %if.then, label %lor.lhs.false2
+
+lor.lhs.false2: ; preds = %lor.lhs.false
+ %2 = load i64, ptr %c.addr, align 8
+ %cmp3 = icmp ugt i64 %2, 1024
+ br i1 %cmp3, label %if.then, label %if.end
+
+if.then: ; preds = %lor.lhs.false2, %lor.lhs.false, %entry
+ br label %if.end10
+
+if.end: ; preds = %lor.lhs.false2
+ %3 = load i64, ptr %a.addr, align 8
+ %4 = load i64, ptr %b.addr, align 8
+ %add = add i64 %3, %4
+ %5 = load i64, ptr %c.addr, align 8
+ %add4 = add i64 %add, %5
+ %6 = load i64, ptr %d.addr, align 8
+ %cmp5 = icmp uge i64 %add4, %6
+ br i1 %cmp5, label %if.then6, label %if.end7
+
+if.then6: ; preds = %if.end
+ br label %if.end10
+
+if.end7: ; preds = %if.end
+ %7 = load i64, ptr %a.addr, align 8
+ %8 = load i64, ptr %d.addr, align 8
+ %cmp8 = icmp uge i64 %7, %8
+ br i1 %cmp8, label %if.then9, label %if.end10
+
+if.then9: ; preds = %if.end7
+ call void @bar()
+ br label %if.end10
+
+if.end10: ; preds = %if.then, %if.then6, %if.then9, %if.end7
+ ret void
+}
+
+declare void @bar()
diff --git a/llvm/test/Transforms/Reassociate/basictest.ll b/llvm/test/Transforms/Reassociate/basictest.ll
index 6205256a3104..3f4057dd14e7 100644
--- a/llvm/test/Transforms/Reassociate/basictest.ll
+++ b/llvm/test/Transforms/Reassociate/basictest.ll
@@ -239,10 +239,8 @@ define i32 @test14(i32 %X1, i32 %X2) {
define i32 @test15(i32 %X1, i32 %X2, i32 %X3) {
; CHECK-LABEL: @test15(
-; CHECK-NEXT: [[A:%.*]] = icmp ne i32 [[X1:%.*]], 0
; CHECK-NEXT: [[B:%.*]] = icmp slt i32 [[X2:%.*]], [[X3:%.*]]
-; CHECK-NEXT: [[C:%.*]] = and i1 [[A]], [[B]]
-; CHECK-NEXT: [[D:%.*]] = select i1 [[C]], i32 [[X1]], i32 0
+; CHECK-NEXT: [[D:%.*]] = select i1 [[B]], i32 [[X1:%.*]], i32 0
; CHECK-NEXT: ret i32 [[D]]
;
%A = icmp ne i32 %X1, 0
diff --git a/llvm/test/Transforms/SCCP/pr50901.ll b/llvm/test/Transforms/SCCP/pr50901.ll
index bbeec180a6a9..11d6bba6f6a9 100644
--- a/llvm/test/Transforms/SCCP/pr50901.ll
+++ b/llvm/test/Transforms/SCCP/pr50901.ll
@@ -49,6 +49,8 @@
; CHECK-DAG: ![[DBG6]] = distinct !DIGlobalVariable(name: "g_66", {{.*}}
; CHECK: ![[G7:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBG7:[0-9]+]], expr: !DIExpression(DW_OP_constu, 70, DW_OP_stack_value))
; CHECK-DAG: ![[DBG7]] = distinct !DIGlobalVariable(name: "g_77", {{.*}}
+; CHECK: = !DIGlobalVariableExpression(var: ![[DBG_FLOAT_UNDEF:.+]], expr: !DIExpression())
+; CHECK-DAG: ![[DBG_FLOAT_UNDEF]] = distinct !DIGlobalVariable(name: "g_float_undef"
@g_1 = dso_local global i32 -4, align 4, !dbg !0
@g_2 = dso_local global float 0x4011C28F60000000, align 4, !dbg !8
@@ -64,6 +66,7 @@
@_ZL4g_55 = internal global i8 1, align 1, !dbg !33
@_ZL4g_66 = internal global ptr null, align 8, !dbg !35
@_ZL4g_77 = internal global ptr inttoptr (i64 70 to ptr), align 8, !dbg !37
+@g_float_undef = internal global float undef, align 4, !dbg !83
define dso_local void @_Z3barv() !dbg !46 {
entry:
@@ -83,6 +86,8 @@ entry:
store ptr %5, ptr @g_6, align 8, !dbg !59
%6 = load ptr, ptr @_ZL4g_77, align 8, !dbg !59
store ptr %6, ptr @g_7, align 8, !dbg !59
+ %l = load float, ptr @g_float_undef, align 8, !dbg !59
+ store float %l, ptr @g_2, align 8, !dbg !59
ret void, !dbg !59
}
@@ -103,7 +108,7 @@ entry:
!4 = !{!5}
!5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64)
!6 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
-!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37}
+!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37, !83}
!8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression())
!9 = distinct !DIGlobalVariable(name: "g_2", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true)
!10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression())
@@ -152,3 +157,5 @@ entry:
!80 = !DILocation(line: 29, column: 5, scope: !81)
!81 = distinct !DILexicalBlock(scope: !77, file: !3, line: 28, column: 3)
!82 = !DILocation(line: 31, column: 1, scope: !77)
+!83 = !DIGlobalVariableExpression(var: !84, expr: !DIExpression())
+!84 = distinct !DIGlobalVariable(name: "g_float_undef", linkageName: "g_float_undef", scope: !2, file: !3, line: 15, type: !6, isLocal: true, isDefinition: true)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
index b20dd0c41e17..0a6899641044 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
@@ -1,9 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-w32-windows-gnu | FileCheck %s
-define i32 @foo() {
+define i32 @foo(i32 %v1, double %v2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[V1:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: br label [[FOR_COND15_PREHEADER:%.*]]
; CHECK: for.cond15.preheader:
; CHECK-NEXT: br label [[IF_END:%.*]]
@@ -13,21 +16,24 @@ define i32 @foo() {
; CHECK-NEXT: br label [[FOR_COND15:%.*]]
; CHECK: for.end39:
; CHECK-NEXT: switch i32 undef, label [[DO_BODY:%.*]] [
-; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 1, label [[SW_BB195:%.*]]
+; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 1, label [[SW_BB195:%.*]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [4 x [2 x double]], ptr undef, i32 0, i64 1, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 0x7FF8000000000000, double 0x7FF8000000000000, double 0x7FF8000000000000, double 0x7FF8000000000000>
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> zeroinitializer, <4 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> <double poison, double undef>, double [[V2:%.*]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]])
; CHECK-NEXT: br label [[SW_EPILOG:%.*]]
; CHECK: sw.bb195:
; CHECK-NEXT: br label [[SW_EPILOG]]
; CHECK: do.body:
; CHECK-NEXT: unreachable
; CHECK: sw.epilog:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP3]], [[SW_BB]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP8]], [[SW_BB]] ]
; CHECK-NEXT: ret i32 undef
; CHECK: if.end.1:
; CHECK-NEXT: br label [[FOR_COND15_1:%.*]]
@@ -36,7 +42,7 @@ define i32 @foo() {
;
entry:
%conv = sitofp i32 undef to double
- %conv2 = sitofp i32 undef to double
+ %conv2 = sitofp i32 %v1 to double
br label %for.cond15.preheader
for.cond15.preheader: ; preds = %for.cond15.1, %entry
@@ -63,7 +69,7 @@ sw.bb: ; preds = %for.end39
%2 = load double, ptr %arrayidx51, align 8
%arrayidx58 = getelementptr inbounds [4 x [2 x double]], ptr undef, i32 0, i64 1, i64 1
%3 = load double, ptr %arrayidx58, align 8
- %mul = fmul double undef, %conv2
+ %mul = fmul double %v2, %conv2
%mul109 = fmul double undef, %conv
%mul143 = fmul double %0, %mul
%4 = call double @llvm.fmuladd.f64(double undef, double %conv2, double %mul143)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
index be6b0bc47c02..096f57d100a5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
@@ -11,11 +11,11 @@ define i32 @fn1() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a, align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, <2 x ptr> [[SHUFFLE]], <2 x i64> <i64 11, i64 56>
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint <2 x ptr> [[TMP2]] to <2 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 11, i64 56>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8
; CHECK-NEXT: ret i32 undef
;
entry:
@@ -34,13 +34,13 @@ declare float @llvm.powi.f32.i32(float, i32)
define void @fn2(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: @fn2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP5]], i32 [[TMP6]])
-; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[C:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 [[TMP3]])
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -90,12 +90,12 @@ define void @externally_used_ptrs() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a, align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, <2 x ptr> [[SHUFFLE]], <2 x i64> <i64 56, i64 11>
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint <2 x ptr> [[TMP2]] to <2 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 56, i64 11>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP4]], align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
index 75431c13a770..ddc2a1b81904 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
@@ -9,15 +9,15 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> <i64 8, i64 12, i64 28, i64 24>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
-; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP6]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = fmul <8 x float> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x float> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
-; CHECK-NEXT: store <8 x float> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
+; CHECK-NEXT: store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4
; CHECK-NEXT: ret void
;
%3 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
index d4c71285a93a..87063fc3f7a8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -103,21 +103,23 @@ define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, pt
; ENABLED-LABEL: @test_supernode_addsub_alt(
; ENABLED-NEXT: entry:
; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
-; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
-; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, ptr [[SARRAY:%.*]], i64 1
; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
-; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
-; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
-; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
-; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]]
-; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]]
-; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]]
-; ENABLED-NEXT: store double [[SUB0]], ptr [[SARRAY]], align 8
-; ENABLED-NEXT: store double [[ADD1]], ptr [[IDXS1]], align 8
+; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
+; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]]
+; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]]
+; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1
+; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
+; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
+; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
index aa3c2be7dc9c..17f9f371ff6e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -12,22 +12,24 @@ define void @foo() {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ]
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]]
; CHECK: bb4:
; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double
-; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]]
-; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double poison, double poison, double undef, double undef>, double [[SUB1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[ADD1]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x double> [[TMP6]] to <4 x float>
-; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]]
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
-; CHECK-NEXT: [[TMP10]] = phi <4 x float> [ [[TMP9]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
+; CHECK-NEXT: [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
; CHECK-NEXT: br label [[BB2]]
;
entry:
diff --git a/llvm/test/Transforms/SROA/pr64081.ll b/llvm/test/Transforms/SROA/pr64081.ll
new file mode 100644
index 000000000000..4b8938421382
--- /dev/null
+++ b/llvm/test/Transforms/SROA/pr64081.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=sroa < %s | FileCheck %s
+
+%B = type { i1, i3 }
+
+define void @test(i7 %x) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i7 [[X:%.*]]) {
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[RES:%.*]] = alloca [2 x i8], align 1
+; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i1, align 8
+; CHECK-NEXT: [[TMP_SROA_1:%.*]] = alloca i3, align 1
+; CHECK-NEXT: store i7 [[X]], ptr [[TMP_SROA_1]], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RES]], ptr align 8 [[TMP_SROA_0]], i64 1, i1 false)
+; CHECK-NEXT: [[TMP_SROA_1_0_RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP_SROA_1_0_RES_SROA_IDX]], ptr align 1 [[TMP_SROA_1]], i64 1, i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = call i8 @use(ptr [[RES]])
+; CHECK-NEXT: ret void
+;
+bb:
+ %res = alloca [2 x i8]
+ %tmp = alloca { i1, i3 }
+ %tmp.1 = getelementptr i8, ptr %tmp, i64 1
+ store i7 %x, ptr %tmp.1
+ call void @llvm.memcpy.p0.p0.i64(ptr %res, ptr %tmp, i64 2, i1 false)
+ call i8 @use(ptr %res)
+ ret void
+}
+
+declare void @use(ptr)
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
diff --git a/llvm/test/Transforms/SampleProfile/ctxsplit.ll b/llvm/test/Transforms/SampleProfile/ctxsplit.ll
index 0b105e1aa703..46e088a63e94 100644
--- a/llvm/test/Transforms/SampleProfile/ctxsplit.ll
+++ b/llvm/test/Transforms/SampleProfile/ctxsplit.ll
@@ -10,16 +10,16 @@
; be read in non-thinlto mode.
; RUN: opt < %s -passes='default<O2>' -pgo-kind=pgo-sample-use-pipeline -use-profiled-call-graph=0 -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=NOTHINLTO
-; POSTLINK: define dso_local i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
-; POSTLINK: define dso_local i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
+; POSTLINK: define dso_local noundef i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
+; POSTLINK: define dso_local noundef i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
; POSTLINK: ![[ENTRY1]] = !{!"function_entry_count", i64 1001}
; POSTLINK: ![[ENTRY2]] = !{!"function_entry_count", i64 -1}
-; PRELINK: define dso_local i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
-; PRELINK: define dso_local i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
+; PRELINK: define dso_local noundef i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
+; PRELINK: define dso_local noundef i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
; PRELINK: ![[ENTRY1]] = !{!"function_entry_count", i64 1001}
; PRELINK: ![[ENTRY2]] = !{!"function_entry_count", i64 3001}
-; NOTHINLTO: define dso_local i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
-; NOTHINLTO: define dso_local i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
+; NOTHINLTO: define dso_local noundef i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
+; NOTHINLTO: define dso_local noundef i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
; NOTHINLTO: ![[ENTRY1]] = !{!"function_entry_count", i64 1001}
; NOTHINLTO: ![[ENTRY2]] = !{!"function_entry_count", i64 3001}
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
index 1662bb99f27b..7c0d5e4f2b65 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
@@ -1,12 +1,13 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt %s -S -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
declare void @foo(i32)
define void @test(i1 %a) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT: [[A_OFF:%.*]] = add i1 [[A:%.*]], true
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT: [[A_OFF:%.*]] = add i1 [[A]], true
; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i1 [[A_OFF]], true
; CHECK-NEXT: br i1 [[SWITCH]], label [[TRUE:%.*]], label [[FALSE:%.*]]
; CHECK: common.ret:
@@ -32,12 +33,13 @@ default:
}
define void @test2(i2 %a) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: switch i2 [[A:%.*]], label [[DOTUNREACHABLEDEFAULT:%.*]] [
-; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
-; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
-; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
-; CHECK-NEXT: i2 -1, label [[CASE3:%.*]]
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: i2 [[A:%.*]]) {
+; CHECK-NEXT: switch i2 [[A]], label [[DOTUNREACHABLEDEFAULT:%.*]] [
+; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
+; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
+; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
+; CHECK-NEXT: i2 -1, label [[CASE3:%.*]]
; CHECK-NEXT: ]
; CHECK: common.ret:
; CHECK-NEXT: ret void
@@ -80,11 +82,12 @@ default:
; This one is a negative test - we know the value of the default,
; but that's about it
define void @test3(i2 %a) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: switch i2 [[A:%.*]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
-; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
-; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: i2 [[A:%.*]]) {
+; CHECK-NEXT: switch i2 [[A]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
+; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
+; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
; CHECK-NEXT: ]
; CHECK: common.ret:
; CHECK-NEXT: ret void
@@ -122,10 +125,11 @@ default:
; Negative test - check for possible overflow when computing
; number of possible cases.
define void @test4(i128 %a) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: switch i128 [[A:%.*]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT: i128 0, label [[CASE0:%.*]]
-; CHECK-NEXT: i128 1, label [[CASE1:%.*]]
+; CHECK-LABEL: define void @test4(
+; CHECK-SAME: i128 [[A:%.*]]) {
+; CHECK-NEXT: switch i128 [[A]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT: i128 0, label [[CASE0:%.*]]
+; CHECK-NEXT: i128 1, label [[CASE1:%.*]]
; CHECK-NEXT: ]
; CHECK: common.ret:
; CHECK-NEXT: ret void
@@ -155,8 +159,9 @@ default:
; All but one bit known zero
define void @test5(i8 %a) {
-; CHECK-LABEL: @test5(
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A:%.*]], 2
+; CHECK-LABEL: define void @test5(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A]], 2
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
; CHECK-NEXT: [[A_OFF:%.*]] = add i8 [[A]], -1
; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[A_OFF]], 1
@@ -187,8 +192,9 @@ default:
;; All but one bit known one
define void @test6(i8 %a) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT: [[AND:%.*]] = and i8 [[A:%.*]], -2
+; CHECK-LABEL: define void @test6(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[A]], -2
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[AND]], -2
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
; CHECK-NEXT: [[A_OFF:%.*]] = add i8 [[A]], 1
@@ -222,8 +228,9 @@ default:
; Check that we can eliminate both dead cases and dead defaults
; within a single run of simplifycfg
define void @test7(i8 %a) {
-; CHECK-LABEL: @test7(
-; CHECK-NEXT: [[AND:%.*]] = and i8 [[A:%.*]], -2
+; CHECK-LABEL: define void @test7(
+; CHECK-SAME: i8 [[A:%.*]]) {
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[A]], -2
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[AND]], -2
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
; CHECK-NEXT: [[A_OFF:%.*]] = add i8 [[A]], 1
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index 3910d8047342..06a8a6fa0482 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -7,10 +7,10 @@ define i64 @i64_test(i64 %i) nounwind readnone {
; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %0
; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %1
-; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<64>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
+; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, align 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc + 4, basealign 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t32: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<64>, t26, TargetConstant:i32<3>, t29, TargetConstant:i32<11>
+; CHECK-NEXT: t32: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t26, TargetConstant:i32<3>, t29, TargetConstant:i32<11>
; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t49, t32
; CHECK-NEXT: t23: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23
diff --git a/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll b/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll
index 3d2badc18674..1976f8f539fd 100644
--- a/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll
+++ b/llvm/test/tools/gold/X86/devirt_vcall_vis_export_dynamic.ll
@@ -116,7 +116,7 @@ target triple = "x86_64-grtev4-linux-gnu"
;; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D]
-; CHECK-IR-LABEL: define dso_local i32 @_start
+; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll b/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll
index bb94cd3e1889..4e84a21ebfdf 100644
--- a/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll
+++ b/llvm/test/tools/gold/X86/devirt_vcall_vis_public.ll
@@ -74,7 +74,7 @@ target triple = "x86_64-grtev4-linux-gnu"
; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D]
-; CHECK-IR-LABEL: define dso_local i32 @_start
+; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/llvm/test/tools/gold/X86/opt-level.ll b/llvm/test/tools/gold/X86/opt-level.ll
index 1ec2ea5e6609..65304c03f704 100644
--- a/llvm/test/tools/gold/X86/opt-level.ll
+++ b/llvm/test/tools/gold/X86/opt-level.ll
@@ -24,7 +24,7 @@ define internal void @foo() {
}
; CHECK-O0: define internal i32 @bar(
-; CHECK-O1: define internal i32 @bar(
+; CHECK-O1: define internal noundef i32 @bar(
define internal i32 @bar(i1 %p) {
br i1 %p, label %t, label %f
diff --git a/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll b/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll
index f927af61d56f..82880a2483fa 100644
--- a/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll
+++ b/llvm/test/tools/gold/X86/v1.16/devirt_vcall_vis_export_dynamic.ll
@@ -114,7 +114,7 @@ target triple = "x86_64-grtev4-linux-gnu"
;; Prevent the vtables from being dead code eliminated.
@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D]
-; CHECK-IR-LABEL: define dso_local i32 @_start
+; CHECK-IR-LABEL: define dso_local {{(noundef )?}}i32 @_start
define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
entry:
%vtable = load ptr, ptr %obj
diff --git a/llvm/test/tools/llvm-cxxfilt/no-params.test b/llvm/test/tools/llvm-cxxfilt/no-params.test
new file mode 100644
index 000000000000..17cbbbe2242b
--- /dev/null
+++ b/llvm/test/tools/llvm-cxxfilt/no-params.test
@@ -0,0 +1,34 @@
+RUN: llvm-cxxfilt _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-PARAMS
+RUN: llvm-cxxfilt -p _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines
+RUN: llvm-cxxfilt --no-params _Z3fooILZ3BarEET_f _Z3fooIPFcfEET_d _ZN1f2baC2ERKNS_2baIT_EE _Z3foov.123 | FileCheck %s --check-prefix=CHECK-NO-PARAMS --match-full-lines
+
+# Check that the -p or --no-params flag omits function parameters and the
+# return type.
+
+CHECK-PARAMS: Bar foo<Bar>(float)
+CHECK-NO-PARAMS: foo<Bar>
+
+# Check that only the top-level function is impacted by the switch, and that
+# nested function types in the encoding (e.g. where a function type is being
+# used as a template parameter) still include their parameters.
+#
+# template <typename T> T foo(double);
+# typedef char (*F)(float);
+# F foo<F>(double)
+
+CHECK-PARAMS: char (*foo<char (*)(float)>(double))(float)
+CHECK-NO-PARAMS: foo<char (*)(float)>
+
+# Use an invalid mangled name, broken in the function parameters, to check how
+# the -p or --no-params flag works. If the option is given, we should be able to
+# demangle the function name just fine. If it is not given, demangling will fail
+# because of the invalid parameters.
+
+CHECK-PARAMS: _ZN1f2baC2ERKNS_2baIT_EE
+CHECK-NO-PARAMS: f::ba::ba
+
+# Check that a vendor-specific suffix is also omitted when --no-params is
+# specified. This matches c++filt's behaviour.
+
+CHECK-PARAMS: foo() (.123)
+CHECK-NO-PARAMS: foo
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm b/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm
new file mode 100644
index 000000000000..5d5219f9375f
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm
@@ -0,0 +1,29 @@
+# REQUIRES: exegesis-can-measure-latency, x86_64-linux
+
+# Check that the values of the segment registers are set properly when in
+# subprocess mode.
+
+# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -snippets-file=%s -execution-mode=subprocess | FileCheck %s
+
+# LLVM-EXEGESIS-DEFREG FS 12345600
+# LLVM-EXEGESIS-DEFREG GS 2468ac00
+# LLVM-EXEGESIS-DEFREG R13 0
+# LLVM-EXEGESIS-DEFREG R14 127
+# LLVM-EXEGESIS-DEFREG R15 0
+# LLVM-EXEGESIS-MEM-DEF MEM1 4096 0000000012345600
+# LLVM-EXEGESIS-MEM-DEF MEM2 4096 000000002468ac00
+# LLVM-EXEGESIS-MEM-MAP MEM1 305418240
+# LLVM-EXEGESIS-MEM-MAP MEM2 610836480
+
+movq %fs:0, %r13
+cmpq $0x12345600, %r13
+cmovneq %r14, %r15
+movq %gs:0, %r13
+cmpq $0x2468ac00, %r13
+cmovneq %r14, %r15
+
+movq $60, %rax
+movq %r15, %rdi
+syscall
+
+# CHECK-NOT: error: 'Child benchmarking process exited with non-zero exit code: Child process returned with unknown exit code'
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s b/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s
index 1c680b7c66d2..e46da53a212c 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s
@@ -7,9 +7,18 @@
# LLVM-EXEGESIS-DEFREG RAX 3
# LLVM-EXEGESIS-DEFREG RCX 5
-# LLVM-EXEGESIS-DEFREG RDI 7
-# LLVM-EXEGESIS-DEFREG RSI B
-# LLVM-EXEGESIS-DEFREG R11 D
+# LLVM-EXEGESIS-DEFREG RDX 7
+# LLVM-EXEGESIS-DEFREG RBX B
+# LLVM-EXEGESIS-DEFREG RSI D
+# LLVM-EXEGESIS-DEFREG RDI 11
+# LLVM-EXEGESIS-DEFREG RSP 13
+# LLVM-EXEGESIS-DEFREG RBP 17
+# LLVM-EXEGESIS-DEFREG R8 1D
+# LLVM-EXEGESIS-DEFREG R9 1F
+# LLVM-EXEGESIS-DEFREG R10 29
+# LLVM-EXEGESIS-DEFREG R11 2B
+# LLVM-EXEGESIS-DEFREG R12 2F
+# LLVM-EXEGESIS-DEFREG R13 35
# LLVM-EXEGESIS-DEFREG R14 127
# LLVM-EXEGESIS-DEFREG R15 0
@@ -17,11 +26,29 @@ cmpq $0x3, %rax
cmovneq %r14, %r15
cmpq $0x5, %rcx
cmovneq %r14, %r15
-cmpq $0x7, %rdi
+cmpq $0x7, %rdx
cmovneq %r14, %r15
-cmpq $0xB, %rsi
+cmpq $0xB, %rbx
cmovneq %r14, %r15
-cmpq $0xD, %r11
+cmpq $0xD, %rsi
+cmovneq %r14, %r15
+cmpq $0x11, %rdi
+cmovneq %r14, %r15
+cmpq $0x13, %rsp
+cmovneq %r14, %r15
+cmpq $0x17, %rbp
+cmovneq %r14, %r15
+cmpq $0x1d, %r8
+cmovneq %r14, %r15
+cmpq $0x1f, %r9
+cmovneq %r14, %r15
+cmpq $0x29, %r10
+cmovneq %r14, %r15
+cmpq $0x2b, %r11
+cmovneq %r14, %r15
+cmpq $0x2f, %r12
+cmovneq %r14, %r15
+cmpq $0x35, %r13
cmovneq %r14, %r15
movq $60, %rax
diff --git a/llvm/test/tools/llvm-nm/wasm/dylink.yaml b/llvm/test/tools/llvm-nm/wasm/dylink.yaml
new file mode 100644
index 000000000000..2a8654526789
--- /dev/null
+++ b/llvm/test/tools/llvm-nm/wasm/dylink.yaml
@@ -0,0 +1,69 @@
+# RUN: yaml2obj %s -o %t.so
+# RUN: llvm-nm %t.so | FileCheck %s
+#
+# CHECK: 00000001 T my_func_export
+# CHECK: 0000002a D my_global_export
+
+--- !WASM
+FileHeader:
+ Version: 0x1
+Sections:
+ - Type: CUSTOM
+ Name: dylink.0
+ MemorySize: 15
+ MemoryAlignment: 0
+ TableSize: 0
+ TableAlignment: 0
+ Needed: []
+ - Type: TYPE
+ Signatures:
+ - Index: 0
+ ParamTypes: []
+ ReturnTypes: []
+ - Type: IMPORT
+ Imports:
+ - Module: env
+ Field: foo
+ Kind: FUNCTION
+ SigIndex: 0
+ - Module: env
+ Field: bar
+ Kind: GLOBAL
+ GlobalType: I32
+ GlobalMutable: true
+ - Module: env
+ Field: memory
+ Kind: MEMORY
+ Memory:
+ Minimum: 0x1
+ - Type: FUNCTION
+ FunctionTypes: [ 0 ]
+ - Type: GLOBAL
+ Globals:
+ - Index: 1
+ Mutable: false
+ Type: I32
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 42
+ - Type: EXPORT
+ Exports:
+ - Name: my_func_export
+ Kind: FUNCTION
+ Index: 1
+ - Name: my_global_export
+ Kind: GLOBAL
+ Index: 1
+ - Type: CODE
+ Functions:
+ - Index: 1
+ Locals:
+ Body: 00
+ - Type: DATA
+ Segments:
+ - SectionOffset: 0
+ InitFlags: 0
+ Offset:
+ Opcode: I32_CONST
+ Value: 0
+ Content: ''
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s
new file mode 100644
index 000000000000..e1d312d6035c
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s
@@ -0,0 +1,105 @@
+;; Test disassembly for gfx12 kernel descriptor.
+
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 1.s > 1.o
+; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 1-disasm.s > 1-disasm.o
+; RUN: cmp 1.o 1-disasm.o
+; CHECK: .amdhsa_kernel kernel
+; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0
+; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0
+; CHECK-NEXT: .amdhsa_kernarg_size 0
+; CHECK-NEXT: ; INST_PREF_SIZE 0
+; CHECK-NEXT: ; GLG_EN 0
+; CHECK-NEXT: ; IMAGE_OP 0
+; CHECK-NEXT: .amdhsa_next_free_vgpr 32
+; CHECK-NEXT: .amdhsa_reserve_vcc 0
+; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0
+; CHECK-NEXT: .amdhsa_next_free_sgpr 8
+; CHECK-NEXT: .amdhsa_float_round_mode_32 0
+; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0
+; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0
+; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; CHECK-NEXT: .amdhsa_fp16_overflow 0
+; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1
+; CHECK-NEXT: .amdhsa_memory_ordered 1
+; CHECK-NEXT: .amdhsa_forward_progress 0
+; CHECK-NEXT: .amdhsa_round_robin_scheduling 0
+; CHECK-NEXT: .amdhsa_enable_private_segment 0
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; CHECK-NEXT: .amdhsa_exception_int_div_zero 0
+; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; CHECK-NEXT: .amdhsa_wavefront_size32 1
+; CHECK-NEXT: .end_amdhsa_kernel
+.amdhsa_kernel kernel
+ .amdhsa_next_free_vgpr 32
+ .amdhsa_next_free_sgpr 32
+ .amdhsa_wavefront_size32 1
+.end_amdhsa_kernel
+
+;--- 2.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 2.s > 2.o
+; RUN: llvm-objdump --disassemble-symbols=kernel.kd 2.o | tail -n +7 | tee 2-disasm.s | FileCheck 2.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-wavefrontsize32,+wavefrontsize64 -filetype=obj -mcpu=gfx1200 < 2-disasm.s > 2-disasm.o
+; RUN: cmp 2.o 2-disasm.o
+; CHECK: .amdhsa_kernel kernel
+; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0
+; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0
+; CHECK-NEXT: .amdhsa_kernarg_size 0
+; CHECK-NEXT: ; INST_PREF_SIZE 0
+; CHECK-NEXT: ; GLG_EN 0
+; CHECK-NEXT: ; IMAGE_OP 0
+; CHECK-NEXT: .amdhsa_next_free_vgpr 32
+; CHECK-NEXT: .amdhsa_reserve_vcc 0
+; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0
+; CHECK-NEXT: .amdhsa_next_free_sgpr 8
+; CHECK-NEXT: .amdhsa_float_round_mode_32 0
+; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0
+; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0
+; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3
+; CHECK-NEXT: .amdhsa_fp16_overflow 0
+; CHECK-NEXT: .amdhsa_workgroup_processor_mode 1
+; CHECK-NEXT: .amdhsa_memory_ordered 1
+; CHECK-NEXT: .amdhsa_forward_progress 0
+; CHECK-NEXT: .amdhsa_round_robin_scheduling 0
+; CHECK-NEXT: .amdhsa_enable_private_segment 0
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
+; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0
+; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
+; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0
+; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0
+; CHECK-NEXT: .amdhsa_exception_int_div_zero 0
+; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
+; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0
+; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
+; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0
+; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0
+; CHECK-NEXT: .amdhsa_wavefront_size32 0
+; CHECK-NEXT: .end_amdhsa_kernel
+.amdhsa_kernel kernel
+ .amdhsa_next_free_vgpr 32
+ .amdhsa_next_free_sgpr 32
+ .amdhsa_wavefront_size32 0
+.end_amdhsa_kernel
diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll
index adedb6b7a5ab..2b4d6806292c 100644
--- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll
+++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll
@@ -21,7 +21,7 @@
; CHECK-NEXT: 68: cmplwi 3, 11
; CHECK-NEXT: 6c: bt 0, 0x60 <L2>
; CHECK-NEXT: 70: mr 31, 3
-; CHECK-NEXT: 74: bl 0x0 <.internal>
+; CHECK-NEXT: 74: bl 0x0 <.extern>
; CHECK-NEXT: 78: nop
; CHECK-NEXT: 7c: mr 3, 31
; CHECK-NEXT: 80: b 0x60 <L2>
diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll
new file mode 100644
index 000000000000..a9cee924845f
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=powerpc-ibm-aix-xcoff %s -filetype=obj -o %t
+; RUN: llvm-objdump %t -r -d --symbolize-operands --no-show-raw-insn \
+; RUN: | FileCheck %s
+
+; CHECK-LABEL: <.a>:
+;; No <L0> should appear
+; CHECK-NEXT: 0: mflr 0
+; CHECK-NEXT: 4: stwu 1, -64(1)
+; CHECK-NEXT: 8: lwz 3, 0(2)
+; CHECK-NEXT:0000000a: R_TOC var
+; CHECK-NEXT: c: stw 0, 72(1)
+; CHECK-NEXT: 10: lwz 3, 0(3)
+; CHECK-NEXT: 14: bl 0x4c <.b>
+; CHECK-NEXT: 18: nop
+; CHECK-NEXT: 1c: li 3, 1
+; CHECK-NEXT: 20: bl 0x0 <.c>
+; CHECK-NEXT:00000020: R_RBR .c
+
+; CHECK-LABEL: <.b>:
+; CHECK-NEXT: 4c: mflr 0
+; CHECK-NEXT: 50: stwu 1, -64(1)
+; CHECK-NEXT: 54: cmplwi 3, 1
+; CHECK-NEXT: 58: stw 0, 72(1)
+; CHECK-NEXT: 5c: stw 3, 60(1)
+; CHECK-NEXT: 60: bf 2, 0x6c <L0>
+; CHECK-NEXT: 64: bl 0x0 <.a>
+; CHECK-NEXT: 68: nop
+; CHECK-NEXT:<L0>:
+; CHECK-NEXT: 6c: li 3, 2
+; CHECK-NEXT: 70: bl 0x0 <.c>
+; CHECK-NEXT:00000070: R_RBR .c
+
+target triple = "powerpc-ibm-aix7.2.0.0"
+
+@var = external global i32, align 4
+
+; Function Attrs: noinline nounwind optnone
+define i32 @a() {
+entry:
+ %0 = load i32, ptr @var, align 4
+ %call = call i32 @b(i32 noundef %0)
+ %call1 = call i32 @c(i32 noundef 1)
+ ret i32 %call1
+}
+
+; Function Attrs: noinline nounwind optnone
+define i32 @b(i32 noundef %x) {
+entry:
+ %x.addr = alloca i32, align 4
+ store i32 %x, ptr %x.addr, align 4
+ %0 = load i32, ptr %x.addr, align 4
+ %cmp = icmp eq i32 %0, 1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %call = call i32 @a()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %call1 = call i32 @c(i32 noundef 2)
+ ret i32 %call1
+}
+
+declare i32 @c(i32 noundef)
+
diff --git a/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml
new file mode 100644
index 000000000000..9c1e90a2d896
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/wasm/dylink-symbol-table.yaml
@@ -0,0 +1,70 @@
+# RUN: yaml2obj %s -o %t.so
+# RUN: llvm-objdump -t %t.so | FileCheck %s
+#
+# CHECK: SYMBOL TABLE:
+# CHECK-NEXT: 00000001 g F CODE my_func_export
+# CHECK-NEXT: 0000002a g O DATA my_global_export
+
+--- !WASM
+FileHeader:
+ Version: 0x1
+Sections:
+ - Type: CUSTOM
+ Name: dylink.0
+ MemorySize: 15
+ MemoryAlignment: 0
+ TableSize: 0
+ TableAlignment: 0
+ Needed: []
+ - Type: TYPE
+ Signatures:
+ - Index: 0
+ ParamTypes: []
+ ReturnTypes: []
+ - Type: IMPORT
+ Imports:
+ - Module: env
+ Field: foo
+ Kind: FUNCTION
+ SigIndex: 0
+ - Module: env
+ Field: bar
+ Kind: GLOBAL
+ GlobalType: I32
+ GlobalMutable: true
+ - Module: env
+ Field: memory
+ Kind: MEMORY
+ Memory:
+ Minimum: 0x1
+ - Type: FUNCTION
+ FunctionTypes: [ 0 ]
+ - Type: GLOBAL
+ Globals:
+ - Index: 1
+ Mutable: false
+ Type: I32
+ InitExpr:
+ Opcode: I32_CONST
+ Value: 42
+ - Type: EXPORT
+ Exports:
+ - Name: my_func_export
+ Kind: FUNCTION
+ Index: 1
+ - Name: my_global_export
+ Kind: GLOBAL
+ Index: 1
+ - Type: CODE
+ Functions:
+ - Index: 1
+ Locals:
+ Body: 00
+ - Type: DATA
+ Segments:
+ - SectionOffset: 0
+ InitFlags: 0
+ Offset:
+ Opcode: I32_CONST
+ Value: 0
+ Content: ''
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
index fbd31d044382..8220361df6cf 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test
@@ -20,7 +20,8 @@ RUN: printf '\3\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\1' >> %t
-RUN: printf '\0\0\0\0\0\0\0\3' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\3' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\344\023\165\112\031\035\265\067' >> %t
@@ -30,7 +31,8 @@ RUN: printf '\2\xff\xff\xd3' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\2' >> %t
-RUN: printf '\0\0\0\0\0\0\0\1' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\1' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\023' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
index bb899c5fdb55..9352ae132380 100644
--- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test
@@ -20,7 +20,8 @@ RUN: printf '\0\0\0\3' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\1\0\0\0' >> %t
-RUN: printf '\0\0\0\0\3\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\3\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\067\265\035\031\112\165\023\344' >> %t
@@ -30,7 +31,8 @@ RUN: printf '\xd3\xff\xff\2' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\2\0\0\0' >> %t
-RUN: printf '\0\0\0\0\1\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\1\0\0\0' >> %t
RUN: printf '\0\0\0\0' >> %t
RUN: printf '\023\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
index 8fcadb6a0dd2..c3e995add6ff 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test
@@ -19,8 +19,10 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t
RUN: printf '\0\0\0\3\0\4\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\0\0\0\1\0\0\0\0' >> %t
-RUN: printf '\0\0\0\3\0\0\0\0' >> %t
+RUN: printf '\0\0\0\1' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\3' >> %t
+RUN: printf '\0\0\0\0' >> %t
RUN: printf '\344\023\165\112\031\035\265\067' >> %t
RUN: printf '\0\0\0\0\0\0\0\02' >> %t
@@ -28,8 +30,10 @@ RUN: printf '\0\0\0\1\0\3\xff\xc8' >> %t
RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\0\0\0\02\0\0\0\0' >> %t
-RUN: printf '\0\0\0\1\0\0\0\0' >> %t
+RUN: printf '\0\0\0\02' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\0\0\0\1' >> %t
+RUN: printf '\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\023' >> %t
RUN: printf '\0\0\0\0\0\0\0\067' >> %t
diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
index 0aa8b38f6926..0b3ef2a89abe 100644
--- a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
+++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test
@@ -19,8 +19,10 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t
RUN: printf '\0\0\4\0\3\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\1\0\0\0\0\0\0\0' >> %t
-RUN: printf '\3\0\0\0\0\0\0\0' >> %t
+RUN: printf '\1\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\3\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
RUN: printf '\067\265\035\031\112\165\023\344' >> %t
RUN: printf '\02\0\0\0\0\0\0\0' >> %t
@@ -28,8 +30,10 @@ RUN: printf '\xc8\xff\3\0\1\0\0\0' >> %t
RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
RUN: printf '\0\0\0\0\0\0\0\0' >> %t
-RUN: printf '\02\0\0\0\0\0\0\0' >> %t
-RUN: printf '\1\0\0\0\0\0\0\0' >> %t
+RUN: printf '\02\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
+RUN: printf '\1\0\0\0' >> %t
+RUN: printf '\0\0\0\0' >> %t
RUN: printf '\023\0\0\0\0\0\0\0' >> %t
RUN: printf '\067\0\0\0\0\0\0\0' >> %t
diff --git a/llvm/test/tools/llvm-readobj/wasm/globals.test b/llvm/test/tools/llvm-readobj/wasm/globals.test
index 4e9f6403c267..0dff18dc5d96 100644
--- a/llvm/test/tools/llvm-readobj/wasm/globals.test
+++ b/llvm/test/tools/llvm-readobj/wasm/globals.test
@@ -19,7 +19,7 @@ Sections:
# CHECK: Section {
# CHECK-NEXT: Type: DATA (0xB)
# CHECK-NEXT: Size: 7
-# CHECK-NEXT: Offset: 8
+# CHECK-NEXT: Offset: 14
# CHECK-NEXT: Segments [
# CHECK-NEXT: Segment {
# CHECK-NEXT: Size: 1
diff --git a/llvm/test/tools/llvm-readobj/wasm/sections.test b/llvm/test/tools/llvm-readobj/wasm/sections.test
index 8b8a526295eb..1a4aadad392f 100644
--- a/llvm/test/tools/llvm-readobj/wasm/sections.test
+++ b/llvm/test/tools/llvm-readobj/wasm/sections.test
@@ -6,27 +6,27 @@
# CHECK-NEXT: Section {
# CHECK-NEXT: Type: TYPE (0x1)
# CHECK-NEXT: Size: 17
-# CHECK-NEXT: Offset: 8
+# CHECK-NEXT: Offset: 14
# CHECK-NEXT: }
# CHECK-NEXT: Section {
# CHECK-NEXT: Type: IMPORT (0x2)
# CHECK-NEXT: Size: 93
-# CHECK-NEXT: Offset: 31
+# CHECK-NEXT: Offset: 37
# CHECK-NEXT: }
# CHECK-NEXT: Section {
# CHECK-NEXT: Type: FUNCTION (0x3)
# CHECK-NEXT: Size: 3
-# CHECK-NEXT: Offset: 130
+# CHECK-NEXT: Offset: 136
# CHECK-NEXT: }
# CHECK-NEXT: Section {
# CHECK-NEXT: Type: CODE (0xA)
# CHECK-NEXT: Size: 36
-# CHECK-NEXT: Offset: 139
+# CHECK-NEXT: Offset: 145
# CHECK-NEXT: }
# CHECK-NEXT: Section {
# CHECK-NEXT: Type: DATA (0xB)
# CHECK-NEXT: Size: 19
-# CHECK-NEXT: Offset: 181
+# CHECK-NEXT: Offset: 187
# CHECK-NEXT: Segments [
# CHECK-NEXT: Segment {
# CHECK-NEXT: Name: .rodata..L.str
@@ -38,13 +38,13 @@
# CHECK-NEXT: Section {
# CHECK-NEXT: Type: CUSTOM (0x0)
# CHECK-NEXT: Size: 89
-# CHECK-NEXT: Offset: 206
+# CHECK-NEXT: Offset: 212
# CHECK-NEXT: Name: linking
# CHECK-NEXT: }
# CHECK-NEXT: Section {
# CHECK-NEXT: Type: CUSTOM (0x0)
# CHECK-NEXT: Size: 15
-# CHECK-NEXT: Offset: 309
+# CHECK-NEXT: Offset: 315
# CHECK-NEXT: Name: reloc.CODE
# CHECK-NEXT: }
# CHECK-NEXT: ]
diff --git a/llvm/test/tools/llvm-readtapi/Inputs/flat_namespace.yaml b/llvm/test/tools/llvm-readtapi/Inputs/flat_namespace.yaml
new file mode 100644
index 000000000000..1cd97f09bac7
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/Inputs/flat_namespace.yaml
@@ -0,0 +1,328 @@
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x01000007
+ cpusubtype: 0x00000003
+ filetype: 0x00000006
+ ncmds: 13
+ sizeofcmds: 1128
+ flags: 0x00100004
+ reserved: 0x00000000
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 7
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x0000000000000F60
+ size: 44
+ offset: 0x00000F60
+ align: 4
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: 554889E54883EC10897DFCBF7A000000E817000000488B0D940000008B39037DFC8945F889F84883C4105DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x0000000000000F8C
+ size: 6
+ offset: 0x00000F8C
+ align: 1
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x00000000
+ reserved2: 0x00000006
+ reserved3: 0x00000000
+ content: FF2586000000
+ - sectname: __stub_helper
+ segname: __TEXT
+ addr: 0x0000000000000F94
+ size: 26
+ offset: 0x00000F94
+ align: 2
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: 4C8D1D6D0000004153FF255D000000906800000000E9E6FFFFFF
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x0000000000000FB0
+ size: 72
+ offset: 0x00000FB0
+ align: 2
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000000
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: 010000001C000000000000001C000000000000001C00000002000000600F000034000000340000008D0F00000000000034000000030000000C000100100001000000000000000001
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __DATA
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 7
+ initprot: 3
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __nl_symbol_ptr
+ segname: __DATA
+ addr: 0x0000000000001000
+ size: 16
+ offset: 0x00001000
+ align: 3
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000006
+ reserved1: 0x00000001
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: '00000000000000000000000000000000'
+ - sectname: __got
+ segname: __DATA
+ addr: 0x0000000000001010
+ size: 8
+ offset: 0x00001010
+ align: 3
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000006
+ reserved1: 0x00000003
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: '0000000000000000'
+ - sectname: __la_symbol_ptr
+ segname: __DATA
+ addr: 0x0000000000001018
+ size: 8
+ offset: 0x00001018
+ align: 3
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000007
+ reserved1: 0x00000004
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: A40F000000000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 220
+ maxprot: 7
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 40
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: foo.dylib
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 8192
+ rebase_size: 8
+ bind_off: 8200
+ bind_size: 48
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 8248
+ lazy_bind_size: 16
+ export_off: 8264
+ export_size: 16
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8288
+ nsyms: 4
+ stroff: 8372
+ strsize: 40
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 3
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8352
+ nindirectsyms: 5
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 24AE602E-6B89-37A8-9E38-39A436D25110
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 658944
+ sdk: 658944
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 29491968
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 82115073
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8280
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8288
+ datasize: 0
+LinkEditData:
+ RebaseOpcodes:
+ - Opcode: REBASE_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ - Opcode: REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 1
+ ExtraData:
+ - 0x0000000000000018
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 1
+ - Opcode: REBASE_OPCODE_DONE
+ Imm: 0
+ BindOpcodes:
+ - Opcode: BIND_OPCODE_SET_DYLIB_SPECIAL_IMM
+ Imm: 14
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: _bar
+ - Opcode: BIND_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 1
+ ULEBExtraData:
+ - 0x0000000000000010
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: dyld_stub_binder
+ - Opcode: BIND_OPCODE_ADD_ADDR_ULEB
+ Imm: 0
+ ULEBExtraData:
+ - 0xFFFFFFFFFFFFFFE8
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DONE
+ Imm: 0
+ Symbol: ''
+ LazyBindOpcodes:
+ - Opcode: BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 1
+ ULEBExtraData:
+ - 0x0000000000000018
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_DYLIB_SPECIAL_IMM
+ Imm: 14
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: _putchar
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DONE
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DONE
+ Imm: 0
+ Symbol: ''
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0000000000000000
+ Address: 0x0000000000000000
+ Other: 0x0000000000000000
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 8
+ Name: _foo
+ Flags: 0x0000000000000000
+ Address: 0x0000000000000F60
+ Other: 0x0000000000000000
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0x0F
+ n_sect: 1
+ n_desc: 0
+ n_value: 3936
+ - n_strx: 7
+ n_type: 0x01
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12
+ n_type: 0x01
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 21
+ n_type: 0x01
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ StringTable:
+ - ' '
+ - _foo
+ - _bar
+ - _putchar
+ - dyld_stub_binder
+ - ''
+ - ''
+...
+
diff --git a/llvm/test/tools/llvm-readtapi/Inputs/mixed-swift-objc.yaml b/llvm/test/tools/llvm-readtapi/Inputs/mixed-swift-objc.yaml
new file mode 100644
index 000000000000..48ceea1ed801
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/Inputs/mixed-swift-objc.yaml
@@ -0,0 +1,4011 @@
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 27
+ sizeofcmds: 3200
+ flags: 0x110085
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 1032
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 12
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x1B30
+ size: 6790
+ offset: 0x1B30
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E531C05DC30F1F840000000000554889E5B8010000005DC30F1F440000554889E531C024010FB6C05DC3909090554889E541555031C089C7E8600500004989C5E8E80200004883C408415D5DC3554889E541554883EC28488945E8488B3F48897DD8E80C1B0000488B45D8488945F04C8B6DF04C896DE0498B4500488B4058FFD0488B7DE04889C1488B45E8488908E8D91A00004883C428415D5DC390554889E541554883EC38488975C0488B3F48897DC8E8BC1A0000488B75C0488B45C8488945F0488B45F0488945D0488B3E48897DD8E89C1A0000488B7DD0488B45D8488945E84C8B6DE84C896DE0498B4500488B4060FFD0488B7DE0E86F1A00004883C438415D5DC30F1F8000000000554889E54883EC304C89EF4883C71031C089C1488D75E8488975D8BA20000000E81D1A0000498B7D1048897DE0E8341A0000488B7DD8E8131A0000488B45E04883C4305DC366662E0F1F840000000000554889E54883EC3048897DE0E8051A00004C89EF4883C71031C089C1488D75E8488975D8BA21000000E8C4190000488B45E0498B7D1049894510E8D1190000488B7DD8E8B6190000488B7DE0E8BF1900004883C4305DC3660F1F840000000000554889E54883EC104C896DF04889FE488B7DF04883C71048897DF831C089C1BA21000000E869190000488B55F8488D050C0000004883C4105DC3660F1F440000554889E54883EC1048897DF84088F0A801750B488B7DF8E842190000EB09488B7DF8E8371900004883C4105DC30F1F00554889E54883EC104C896DF048C745F8000000004C896DF8498B7D10E81F190000488B45F04883C4105DC30F1F440000554889E54883EC1048C745F8000000004C896DF8E8B7FFFFFF4889C7BE18000000BA07000000E8CD1800004883C4105DC36666666666662E0F1F840000000000554889E54155504C89EFBE18000000BA07000000E8931800004989C5E80F0000004883C408415D5DC30F1F8000000000554889E541554883EC184C896DE848C745F0000000004C896DF031C089C7E8AD0200004989C5E8350000004889C1488B45E8488948104883C418415D5DC36690554889E5488D3D95320000E88A17000031C989CA5DC390909090909090909090554889E54155504C89EFBE10000000BA07000000E8031800004989C5E80F0000004883C408415D5DC30F1F8000000000554889E54C89E848C745F800000000488945F85DC366662E0F1F840000000000554889E548C745F8000000004C896DF85DC366666666662E0F1F840000000000554889E54C89E848C745F800000000488945F85DC366662E0F1F840000000000554889E54883EC1048C745F8000000004C896DF8E8C7FFFFFF4889C7BE10000000BA07000000E86D1700004883C4105DC36666666666662E0F1F840000000000554889E54155504C89EFBE10000000BA07000000E8331700004989C5E80F0000004883C408415D5DC30F1F8000000000554889E54C89E848C745F800000000488945F85DC366662E0F1F840000000000554889E548C745F8000000004C896DF85DC366666666662E0F1F840000000000554889E54C89E848C745F800000000488945F85DC366662E0F1F840000000000554889E54883EC1048C745F8000000004C896DF8E8C7FFFFFF4889C7BE10000000BA07000000E89D1600004883C4105DC36666666666662E0F1F840000000000554889E54155504C89EFBE10000000BA07000000E8631600004989C5E80F0000004883C408415D5DC30F1F8000000000554889E54C89E848C745F800000000488945F85DC366662E0F1F840000000000554889E548C745F8000000004C896DF85DC366666666662E0F1F840000000000554889E54C89E848C745F800000000488945F85DC366662E0F1F840000000000554889E54883EC1048C745F8000000004C896DF8E8C7FFFFFF4889C7BE10000000BA07000000E8CD1500004883C4105DC36666666666662E0F1F840000000000554889E5488D3DBD300000E8FA14000031C989CA5DC3662E0F1F840000000000554889E5488D3D3D310000E8DA14000031C989CA5DC3662E0F1F840000000000554889E5488D3DBD310000E8BA14000031C989CA5DC390909090909090909090554889E54157415641554154534883EC484C894DA04C8945A848894DB0488955B84989F68B4F2089C8C1E81883E00148C1E004488D14074881C2C00000004889559848897DC84801F8C1E91783E1084889CA48894D904C8BBC01C0000000488D8680000000488945D031C0488945C04531ED66666666662E0F1F8400000000004D89FC4983E4037427498D4424FF4883F8010F861101000041F6C5017424E9B9000000666666662E0F1F8400000000004C89F7E8A80B000041F6C5010F859A000000488B4DB848C7411800000000488B45A048894120488B45B048894108488B45A848890131C031D231C931
DBF0490FC74E50EB2166662E0F1F8400000000004881E1FFF3FFFF418856214889C3F0490FC74E507432F7C20002000075224889D1F7C20004000075D74881E1FFF7FFFFEBD966666666662E0F1F8400000000004C89F7E808050000488B7DD0E88F1200004C89F7E8B708000041B50131C031D231C931DBF0490FC74E500FB6DA483B5DC07610488B7DC84889DEE87106000048895DC04C89F84883E0FC498946104C89F8488B4D90488B5598F04C0FB1340A74614989C7E9D7FEFFFF488B7DC8E85E0A000041F6C501746D31C031D231C931DBF0490FC74E50EB130F1F8400000000004889C3F0490FC74E507439F7C20002000075294889D14881C900080000F7C20004000074DB4881E1FFFBFFFF41885621EBCEE8A90700004531E4EB194C89F7E8AC0200004C89F7E8C4070000488B7DD0E86B1100004C89E04883C4485B415C415D415E415F5DC3660F1F840000000000480FBAED3C554156488D6C24084883EC0841574154534883EC184989CF4989D44889F34989FDE835070000488D0DBE000000488948384C8978404889DF4889C64C89FA4D89E74C89E14D89F04D89E9E86CFDFFFF4883F801741C4883F80274714883C4185B415C415F4883C4105D480FBAF53CC208008B432089C1C1E91883E10148C1E1044801D9C1E81783E008488D3C01488B9408C8000000488B42F80FB67050488D0C3E4881C1D800000048F7D64821CE4C89EFFF50104C89F84883C4185B415C415F4883C4105D480FBAF53CFFE0488D35FE13000031FF31C0E86F0600006666666666662E0F1F840000000000480FBAED3C554156488D6C2408504C89F04D8B364883C4105D480FBAF53CFF60086666666666662E0F1F840000000000480FBAED3C554156488D6C24084883EC0841574154534883EC184989CF4989D44889F34989FDE815060000488D0DDE000000488948384C8978404889DF4889C64C89FA4D89E74C89E14D89F04D89E9E84CFCFFFF4883F80274524883F8010F85880000008B432089C1C1E91883E10148C1E1044801D9C1E81783E008488D3C01488B9408C8000000488B42F80FB67050488D0C3E4881C1D800000048F7D64821CE4C89EFFF50104531EDEB288B432089C1C1E91883E10148C1E1044801D9C1E81783E0084C8BAC08D00000004C89EFE8301100004C89F84883C4185B415C415F4883C4105D480FBAF53CFFE04883C4185B415C415F4883C4105D480FBAF53CC2080066666666662E0F1F840000000000480FBAED3C554156488D6C240850498B06498B4E084D8B6E184989C64883C4105D480FBAF53CFFE10F1F840000000000554889E54983F921B8200000004C0F42C85DFF64240890909090909090909090554889E54157415641554154534883EC284989FF31C031D231C931DBF0480FC74F504989C54989D631F6488945C0488955C84C8D65C041F7C6000200007430904889F3666666662E0F1F8400000000004C89FF4C89E6E8550300004C8B75C841F7C60002000075E84C8B6DC04889DE4885F6740C4C896E18EB41660F1F440000BF20000000E8420F00004889C348C70001000000488D780848897DB831F6E8BD0B000048C74310C00000004C896B18488B7DB8E8C80B00004889DE488D5E104981CE00020000488B45C0488B55C84C89F1F0490FC74F5074204989C54989D6488945C0488955C841F7C6000200000F854CFFFFFFE976FFFFFF488B45C84889C14881C90008000048894DC8A900040000740F4881E1FFFBFFFF48894DC8418847214989F6488B5DC0498B4750498B5758F0490FC74F5075F8488D3D692E0000E83C0B0000498D5E084889DFE8400B0000498B064883F80175124889DFE80F0B00004C89F7E85D0E0000EB0648FFC8498906488D3D302E00004883C4285B415C415D415E415F5DE9050B00000F1F440000554889E54157415641554154534883EC284989FF31C031D231C931DBF0480FC74F504989C54989D631F6488945C0488955C84C8D65C041F7C6000200007430904889F3666666662E0F1F8400000000004C89FF4C89E6E8C50100004C8B75C841F7C60002000075E84C8B6DC04889DE4885F6740C4C896E18EB41660F1F440000BF20000000E8B20D00004889C348C70001000000488D780848897DB831F6E82D0A000048C74310C00000004C896B18488B7DB8E8380A00004889DE488D5E104981CE00020000488B45C0488B55C84C89F1F0490FC74F5074204989C54989D6488945C0488955C841F7C6000200000F854CFFFFFFE976FFFFFF488B45C84889C14881E1FFF7FFFF48894DC8A90004000074124889C14881E1FFF3FFFF48894DC8418847214989F6488B5DC0498B4750498B5758666666662E0F1F840000000000F0490FC74F5075F8488D3DC92C0000E89C090000498D5E084889DFE8A0090000498B064883F80175124889DFE86F0900004C89F7E8BD0C0000EB0648FFC8498906488D3D902C00004883C4285B4
15C415D415E415F5DE9650900000F1F440000554889E54156534889F34989FE48833D5B2C0000FF7518488B054A2C00004885C074284C89F74889DE5B415E5DFFE0488D3D3A2C0000488D352B2C0000488D150C000000E8650C0000EBCCE8580C0000554889E553504889FB488D35F60E000048C7C7FEFFFFFFE8480C00004889034883C4085B5DC3662E0F1F840000000000554889E54157415641554154534883EC184989F64989FF4C8D25DA2B000048897DD0EB466666662E0F1F84000000000048FFC8488B4DC84889014D89EC4C8B7DD04C89EFE89708000031C031D231C931DBF0490FC74F5049890649895608F7C2000200000F84A10000004C89E7E85E08000031C031D231C931DBF0490FC74F5049890649895608F7C20002000074624889C34885C0745A4989DF488D43F0488945C848FF43F04C89E7E8320800004883C3F84889DFE8160800004889DFE81E0800004D89E54C89E7E803080000498B47F04883F8010F8555FFFFFF4889DFE8DD070000488B7DC8E82A0B0000E949FFFFFF488D3D002B00004883C4185B415C415D415E415F5DE9D50700004883C4185B415C415D415E415F5DC3909090909090554889E5E8FF0A000090909090909090554889E5BF670000005DE9370B000090554889E55350BF67000000E8260B00004889C3BF6700000031F6E8770B00004889D84883C4085B5DC30F1F8000000000554889E553504889FBBF68000000E8F30A0000488D78184889DE4883C4085B5DE91B01000066662E0F1F840000000000554889E54157415653504889FBBF68000000E8BF0A00004989C6488D3D3F2A0000488D35A800000031D2E82B0B0000803D322A00000075404D8B7E1848833D342A0000FF753D488B05232A00004885C074094C89FFFFD084C0740B498B7E18E8080B0000EB04498B46184889432841C646180041C64620004883C4085B415E415F5DC3488D3DEE290000488D35DF290000488D1538010000E8F1090000EBA790554889E5488D3DB5290000488D351E00000031D2E8A10A00000FB605A82900005DC366666666662E0F1F840000000000554889E54883EC20488D7DE8E8A109000048837DE80C0F94C048837DF0000F99C10F9EC220C120D1880D6A2900004883C4205DC36666662E0F1F840000000000554889E54157415653504989F64889FB488D3D39290000488D35A2FFFFFF31D2E8250A0000803D2C29000000740B4883C4085B415E415F5DC34D8B7E2848833D23290000FF7557488B05122900004885C074094C89FFFFD084C0740B498B7E28E8F7090000EB04498B462849C74628FFFFFFFF807B0800741C488D48014883F90272AB4889C74883C4085B415E415F5DE9FB080000C6430801488903EB90488D3DC3280000488D35B4280000488D150D000000E8C6080000EB8D660F1F440000554889E553504889FB488D356A0B000048C7C7FEFFFFFFE8A80800004889034883C4085B5DC390909090909090909090554889E553504889FB48833D77280000FF751E488B05662800004885C0740B4889DF4883C4085B5DFFE04883C4085B5DC3488D3D50280000488D3541280000488D155A000000E843080000EBC60F1F00554889E553504889FB48833D37280000FF751E488B05262800004885C0740B4889DF4883C4085B5DFFE04883C4085B5DC3488D3D10280000488D3501280000488D153A000000E8F3070000EBC60F1F00554889E553504889FB488D35BD0A000048C7C7FEFFFFFFE8D80700004889034883C4085B5DC3662E0F1F840000000000554889E553504889FB488D359C0A000048C7C7FEFFFFFFE8A80700004889034883C4085B5DC390909090909090909090554889E5535031F6E8B707000085C075074883C4085B5DC389C389C7E81F000000488D35630A0000488D157D0A000031FF4889C14189D831C0E842FCFFFF6690FFCF83FF22771D554889E5488D054200000048630CB84801C15DFFE1488D05E40C0000C3488D05FE0C0000C3488D05DA0C0000C3488D05DA0C0000C3488D05E00C0000C3488D05B50C0000C3488D05C90C0000C3C8FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD8FFFFFFE0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFE8FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFF0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFD0FFFFFFF8FFFFFF554889E55350E89306000085C075074883C4085B5DC389C389C7E801FFFFFF488D3545090000488D158609000031FF4889C14189D831C0E824FBFFFF0F1F4000554889E55350E85F06000085C075074883C4085B5DC389C389C7E8C1FEFFFF488D3505090000488D156709000031FF4889C14189D831C0E8E4FAFFFF0F1F4000554889E55350E80D06000085C075074883C4085B5DC389C389C7E881FEFFFF
488D35C5080000488D154709000031FF4889C14189D831C0E8A4FAFFFF0F1F4000554889E55350E8E505000085C075074883C4085B5DC389C389C7E841FEFFFF488D3585080000488D152A09000031FF4889C14189D831C0E864FAFFFF0F1F4000554889E541574156534883EC184189F74889FB488D7DD8E8C405000085C07539410FB6F7488D7DD8E8B905000085C07542488D75D84889DFE88505000085C07559488D7DD8E89005000085C075654883C4185B415E415F5DC34189C689C7E8BDFDFFFF488D3501080000488D15CC080000EB184189C689C7E8A3FDFFFF488D35E7070000488D15D008000031FF4889C14589F031C0E8C6F9FFFF89C389C7E87DFDFFFF488D35C1070000488D15D1080000EB1789C389C7E864FDFFFF488D35A8070000488D15DA08000031FF4889C14189D831C0E887F9FFFF0F1F8000000000554889E55350E8D104000085C075074883C4085B5DC389C389C7E821FDFFFF488D3565070000488D15B808000031FF4889C14189D831C0E844F9FFFF0F1F4000554889E55350E89D04000085C075074883C4085B5DC389C389C7E8E1FCFFFF488D3525070000488D159608000031FF4889C14189D831C0E804F9FFFF0F1F4000554889E55350E86904000085C075074883C4085B5DC389C389C7E8A1FCFFFF488D35E5060000488D157108000031FF4889C14189D831C0E8C4F8FFFF0F1F4000554889E55350E82304000089C3A9EFFFFFFF750C85DB0F94C04883C4085B5DC389DFE859FCFFFF488D359D060000488D154608000031FF4889C14189D831C0E87CF8FFFF6666662E0F1F840000000000554889E5C707000000005DC30F1F4000554889E55DC3662E0F1F840000000000554889E55DE96C030000660F1F440000554889E55DE968030000660F1F440000554889E55DE952030000660F1F440000554889E5535031F6E8A503000085C075074883C4085B5DC389C389C7E8BFFBFFFF488D3503060000488D15CA07000031FF4889C14189D831C0E8E2F7FFFF6690554889E55350E86103000085C075074883C4085B5DC389C389C7E881FBFFFF488D35C5050000488D15B207000031FF4889C14189D831C0E8A4F7FFFF0F1F4000554889E55350E82D03000085C075074883C4085B5DC389C389C7E841FBFFFF488D3585050000488D159207000031FF4889C14189D831C0E864F7FFFF0F1F4000554889E55350E8F302000089C3A9EFFFFFFF750C85DB0F94C04883C4085B5DC389DFE8F9FAFFFF488D353D050000488D156907000031FF4889C14189D831C0E81CF7FFFF6666662E0F1F840000000000554889E55350E8B502000085C075074883C4085B5DC389C389C7E8B1FAFFFF488D35F5040000488D154307000031FF4889C14189D831C0E8D4F6FFFF0F1F4000554889E55350E86902000089C3A9EFFFFFFF750C85DB0F94C04883C4085B5DC389DFE869FAFFFF488D35AD040000488D151A07000031FF4889C14189D831C0E88CF6FFFF6666662E0F1F840000000000554889E55350E81F02000085C075074883C4085B5DC389C389C7E821FAFFFF488D3565040000488D15F406000031FF4889C14189D831C0E844F6FFFF0F1F4000554889E55350E8DF01000085C075074883C4085B5DC389C389C7E8E1F9FFFF488D3525040000488D15B406000031FF4889C14189D831C0E804F6FFFF90909090554889E553504889FB48833D17210000FF751E488B05062100004885C0740B4889DF4883C4085B5DFFE04883C4085B5DC3488D3DF0200000488D35E1200000488D155A000000E8C3000000EBC60F1F00554889E553504889FB48833DD7200000FF751E488B05C62000004885C0740B4889DF4883C4085B5DFFE04883C4085B5DC3488D3DB0200000488D35A1200000488D153A000000E873000000EBC60F1F00554889E553504889FB488D354306000048C7C7FEFFFFFFE8580000004889034883C4085B5DC3662E0F1F840000000000554889E553504889FB488D353606000048C7C7FEFFFFFFE8280000004889034883C4085B5DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x35B6
+ size: 252
+ offset: 0x35B6
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF254C0A0000FF254E0A0000FF25500A0000FF25520A0000FF25540A0000FF25560A0000FF25580A0000FF255A0A0000FF255C0A0000FF255E0A0000FF25600A0000FF25620A0000FF25640A0000FF25660A0000FF25680A0000FF256A0A0000FF256C0A0000FF256E0A0000FF25700A0000FF25720A0000FF25740A0000FF25760A0000FF25780A0000FF257A0A0000FF257C0A0000FF257E0A0000FF25800A0000FF25820A0000FF25840A0000FF25860A0000FF25880A0000FF258A0A0000FF258C0A0000FF258E0A0000FF25900A0000FF25920A0000FF25940A0000FF25960A0000FF25980A0000FF259A0A0000FF259C0A0000FF259E0A0000
+ - sectname: __const
+ segname: __TEXT
+ addr: 0x36C0
+ size: 190
+ offset: 0x36C0
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 4028232950524F4752414D3A53756767657374696F6E734B6974202050524F4A4543543A53756767657374696F6E734B69742D310A000000000000000000F03F100000000000000053756767657374696F6E734B697400000000000000000000547261696C53756767657374696F6E7300000300000000000000000000000000417661696C61626C655365727669636573000000000000000000000000000000536572766963655265717565737465720053657276696365526174657200
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3780
+ size: 1144
+ offset: 0x3780
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 5F547443313453756767657374696F6E734B69743136547261696C53756767657374696F6E7300616374697665536572766963657300000000000000000000005F5474433137417661696C61626C6553657276696365733137417661696C61626C6553657276696365730000000000005F5474433137417661696C61626C655365727669636573313653657276696365526571756573746572000000000000005F5474433137417661696C61626C655365727669636573313253657276696365526174657200667574757265207265706F7274656420616E206572726F722C2062757420776169742063616E6E6F74207468726F770073776966745F7461736B5F657363616C617465005F5A3139766F75636865725F6E656564735F61646F70745039766F75636865725F73005F5F7473616E5F61637175697265005F5F7473616E5F72656C656173650027257327206661696C65642077697468206572726F722027257327282564290A00707468726561645F636F6E645F696E69742826636F6E646974696F6E2C206E756C6C7074722900707468726561645F636F6E645F64657374726F792826636F6E646974696F6E2900707468726561645F636F6E645F7369676E616C2826636F6E646974696F6E2900707468726561645F636F6E645F62726F6164636173742826636F6E646974696F6E2900707468726561645F636F6E645F776169742826636F6E646974696F6E2C20266D757465782900707468726561645F6D75746578617474725F696E69742826617474722900707468726561645F6D75746578617474725F736574747970652826617474722C206B696E642900707468726561645F6D757465785F696E697428266D757465782C2026617474722900707468726561645F6D75746578617474725F64657374726F792826617474722900707468726561645F6D757465785F64657374726F7928266D757465782900707468726561645F6D757465785F6C6F636B28266D757465782900707468726561645F6D757465785F756E6C6F636B28266D757465782900707468726561645F6D757465785F7472796C6F636B28266D757465782900707468726561645F72776C6F636B5F696E6974282672776C6F636B2C206E756C6C7074722900707468726561645F72776C6F636B5F64657374726F79282672776C6F636B2900707468726561645F72776C6F636B5F72646C6F636B282672776C6F636B2900707468726561645F72776C6F636B5F74727972646C6F636B282672776C6F636B2900707468726561645F72776C6F636B5F77726C6F636B282672776C6F636B2900707468726561645F72776C6F636B5F74727977726C6F636B282672776C6F636B2900707468726561645F72776C6F636B5F756E6C6F636B282672776C6F636B290045494E56414C00455045524D0045444541444C4B00454E4F4D454D0045414741494E004542555359003C756E6B6E6F776E3E0073776966745F7461736B5F656E7465725468726561644C6F63616C436F6E746578740073776966745F7461736B5F657869745468726561644C6F63616C436F6E7465787400
+ - sectname: __constg_swiftt
+ segname: __TEXT
+ addr: 0x3BF8
+ size: 312
+ offset: 0x3BF8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 000000000000000008FBFFFF50000080F0FFFFFF14FBFFFF30E2FFFF4401000000000000030000000F00000005000000010000000A0000000B000000040000001200000004E0FFFF130000004CE0FFFF14000000A4E0FFFF010000007CE1FFFF0000000000000000E0FAFFFF50000080F0FFFFFFD4FAFFFF60E4FFFF0001000000000000030000000C00000002000000000000000A0000000A0000000200000001000000C4E1FFFF100000000CE2FFFF50000080ACFFFFFFB0FAFFFF3CE4FFFFCC00000000000000030000000C00000002000000000000000A0000000A000000020000000100000050E2FFFF1000000098E2FFFF5000008068FFFFFF7DFAFFFF18E4FFFF9800000000000000030000000C00000002000000000000000A0000000A0000000200000001000000DCE2FFFF1000000024E3FFFF
+ - sectname: __swift5_typeref
+ segname: __TEXT
+ addr: 0x3D30
+ size: 24
+ offset: 0x3D30
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 01D3FEFFFF0002C902000000016BFFFFFF0001A9FFFFFF00
+ - sectname: __swift5_reflstr
+ segname: __TEXT
+ addr: 0x3D48
+ size: 15
+ offset: 0x3D48
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '616374697665536572766963657300'
+ - sectname: __swift5_fieldmd
+ segname: __TEXT
+ addr: 0x3D58
+ size: 76
+ offset: 0x3D58
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: D8FFFFFF0000000001000C000100000002000000CAFFFFFFD8FFFFFFC2FFFFFF0000000001000C0000000000B8FFFFFF0000000001000C0000000000AEFFFFFF0000000001000C0000000000
+ - sectname: __swift5_types
+ segname: __TEXT
+ addr: 0x3DA4
+ size: 16
+ offset: 0x3DA4
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 60FEFFFFBCFEFFFFFCFEFFFF3CFFFFFF
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3DB4
+ size: 224
+ offset: 0x3DB4
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000050000003000000000000000300000000200000000000001010001010300010161010301D1580501301B00004800000048000000B73500000000000048000000030000000C0020008C000300000000003000000210010000A00200021003000030030002600300000004000230040000D00400020005000000060004400800073009000060090006700A0000C00A0004F00D0005400E0001700E0004900F0000B00F000110100003B010000020110003E0110001501300003014000130150003101600012017000070170001210002018800000460000004
+ - sectname: __eh_frame
+ segname: __TEXT
+ addr: 0x3E98
+ size: 320
+ offset: 0x3E98
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 1400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C070890010000240000001C00000070E4FFFFFFFFFFFFE10000000000000000460E1086024B0D064983078C068F05240000004400000068E5FFFFFFFFFFFF020100000000000000460E1086024B0D064983078C068F051400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C0708900100001400000000000000017A520001781001100C070890010000
+ - sectname: __objc_classname
+ segname: __TEXT
+ addr: 0x3FD8
+ size: 29
+ offset: 0x3FD8
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 506F646361737453756767657374696F6E0053756767657374696F6E00
+ - cmd: LC_SEGMENT_64
+ cmdsize: 472
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 4096
+ fileoff: 16384
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 5
+ flags: 0
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 344
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x2A
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 643C00000000100000000000000010800100000000001080020000000000108003000000000010800400000000001080050000000000108006000000000010800700000000001080080000000000108009000000000010800A000000000010800B000000000010800C000000000010800D000000000010800E000000000010800F0000000000108010000000000010801100000000001080120000000000108013000000000010801400000000001080150000000000108016000000000010801700000000001080180000000000108019000000000010801A000000000010801B000000000010801C000000000010801D000000000010801E000000000010801F00000000001080200000000000108021000000000010802200000000001080230000000000108024000000000010802500000000001080260000000000108027000000000010802800000000001080290000000000A080
+ - sectname: __const
+ segname: __DATA_CONST
+ addr: 0x4158
+ size: 144
+ offset: 0x4158
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00004002DCFAFFFF20DAFFFF6CDAFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000385300000000800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003853000000001000
+ - sectname: __objc_classlist
+ segname: __DATA_CONST
+ addr: 0x41E8
+ size: 48
+ offset: 0x41E8
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x10000000
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 28500000000010007850000000001000E05000000000100098510000000010003852000000001000D852000000005000
+ - sectname: __objc_imageinfo
+ segname: __DATA_CONST
+ addr: 0x4218
+ size: 8
+ offset: 0x4218
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 0000000040070B05
+ - sectname: __objc_const
+ segname: __DATA_CONST
+ addr: 0x4220
+ size: 904
+ offset: 0x4220
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 810000002800000028000000000000000000000000000000D83F00000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000800000000800000008000000000000000000000000000000D83F00000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000810000002800000028000000000000000000000000000000EA3F00000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000800000000800000008000000000000000000000000000000EA3F0000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000081000000280000002800000000000000000000000000000080370000000070000000000000000000000000000000000000000000000000000000000000000000000000000000000020000000010000000037000000001000A737000000001000EC370000000050000300000008000000800000001000000018000000000000000000000000000000803700000000300000000000000000000000000000000000884300000000600000000000000000000000000000000000810000002800000028000000000000000000000000000000C03700000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000800000001000000010000000000000000000000000000000C03700000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000810000002800000028000000000000000000000000000000F03700000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000800000001000000010000000000000000000000000000000F03700000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000810000002800000028000000000000000000000000000000203800000000900000000000000000000000000000000000000000000000000000000000000000000000000000000000800000001000000010000000000000000000000000000000203800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 552
+ segname: __DATA
+ vmaddr: 20480
+ vmsize: 4096
+ fileoff: 20480
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 6
+ flags: 0
+ Sections:
+ - sectname: __objc_data
+ segname: __DATA
+ addr: 0x5000
+ size: 160
+ offset: 0x5000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 2B0000000000108050500000000010002A0000000000208000000000000000002042000000001000005000000000100078500000000010002A00000000002080000000000000000068420000000010002B000000000010802B000000000010802A000000000020800000000000000000B04200000000100050500000000010002C000000000010802A000000000020800000000000000000F842000000001000
+ - sectname: __data
+ segname: __DATA
+ addr: 0x50A0
+ size: 665
+ offset: 0x50A0
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 2D000000000010802D000000000010802A00000000002080000000000000000040430000000020000000000000000000901D0000000010002E00000000001080A0500000000010002F000000000010802A000000000020800000000000000000B243000000004000020000000000000018000000070000009000000018000000043C00000000300000000000000000001000000000000000401C000000001000901C000000001000F01C000000001000D01D0000000010002D000000000010802D000000000010802A000000000020800000000000000000F8430000000020000000000000000000F01E0000000010002E0000000000108058510000000010002F000000000010802A0000000000208000000000000000004244000000004000020000000000000010000000070000007800000018000000643C0000000020000000000000000000601E000000001000B01E0000000010002D000000000010802D000000000010802A00000000002080000000000000000088440000000020000000000000000000C01F0000000010002E00000000001080F8510000000010002F000000000010802A000000000020800000000000000000D244000000004000020000000000000010000000070000007800000018000000A83C0000000020000000000000000000301F000000001000801F0000000010002D000000000010802D000000000010802A0000000000208000000000000000001845000000002000000000000000000090200000000010002E0000000000108098520000000010002F000000000010802A0000000000208000000000000000006245000000004000020000000000000010000000070000007800000018000000EC3C00000000200000000000000000000020000000001000502000000000800000
+ - sectname: __s_async_hook
+ segname: __DATA
+ addr: 0x5340
+ size: 400
+ offset: 0x5340
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000D0250000000010007023000000001000902400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ - sectname: __swift56_hooks
+ segname: __DATA
+ addr: 0x54D0
+ size: 176
+ offset: 0x54D0
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+ - sectname: __common
+ segname: __DATA
+ addr: 0x5580
+ size: 8
+ offset: 0x0
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x1
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ - sectname: __bss
+ segname: __DATA
+ addr: 0x5588
+ size: 120
+ offset: 0x0
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x1
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 24576
+ vmsize: 32768
+ fileoff: 24576
+ filesize: 29912
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 104
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 65536
+ compatibility_version: 65536
+ Content: '/System/Library/Frameworks/SuggestionsKit.framework/Versions/A/SuggestionsKit'
+ ZeroPadBytes: 3
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 24576
+ datasize: 1288
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 25864
+ datasize: 1176
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 28832
+ nsyms: 460
+ stroff: 36536
+ strsize: 17952
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 345
+ iextdefsym: 345
+ nextdefsym: 60
+ iundefsym: 405
+ nundefsym: 55
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 36192
+ nindirectsyms: 85
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 12258037-7DB8-3F3E-B24D-0CF79DC22D90
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 787200
+ sdk: 983040
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 62456064
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 6
+ minos: 1179648
+ sdk: 1179648
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 62456064
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_SEGMENT_SPLIT_INFO
+ cmdsize: 16
+ dataoff: 27040
+ datasize: 1680
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 14942208
+ compatibility_version: 65536
+ Content: '/usr/lib/libobjc.A.dylib'
+ ZeroPadBytes: 8
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88342528
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 104897792
+ compatibility_version: 65536
+ Content: '/usr/lib/libc++.1.dylib'
+ ZeroPadBytes: 1
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 64
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 330496
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftCore.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_WEAK_DYLIB
+ cmdsize: 72
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 7889920
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftCoreFoundation.dylib'
+ ZeroPadBytes: 5
+ - cmd: LC_LOAD_WEAK_DYLIB
+ cmdsize: 64
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftDarwin.dylib'
+ ZeroPadBytes: 5
+ - cmd: LC_LOAD_WEAK_DYLIB
+ cmdsize: 64
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 2686976
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftDispatch.dylib'
+ ZeroPadBytes: 3
+ - cmd: LC_LOAD_WEAK_DYLIB
+ cmdsize: 64
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 65536
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftIOKit.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_LOAD_WEAK_DYLIB
+ cmdsize: 64
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 524288
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftObjectiveC.dylib'
+ ZeroPadBytes: 1
+ - cmd: LC_LOAD_WEAK_DYLIB
+ cmdsize: 64
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 2752512
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftXPC.dylib'
+ ZeroPadBytes: 8
+ - cmd: LC_LOAD_WEAK_DYLIB
+ cmdsize: 64
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 65536
+ compatibility_version: 65536
+ Content: '/usr/lib/swift/libswiftFoundation.dylib'
+ ZeroPadBytes: 1
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 28720
+ datasize: 104
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 28824
+ datasize: 8
+LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 764
+ Name: _
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 236
+ Name: '$s1'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 93
+ Name: 4SuggestionsKit05TrailA0C
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 20
+ Name: 14activeServices09AvailableE0AECv
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 6
+ Name: M
+ Flags: 0x0
+ Address: 0x1CF0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 15
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3C48
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 34
+ Name: g
+ Flags: 0x0
+ Address: 0x1C40
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 43
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3C38
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 54
+ Name: p
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 48
+ Name: MV
+ Flags: 0x0
+ Address: 0x4158
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 69
+ Name: Wvd
+ Flags: 0x0
+ Address: 0x3700
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 74
+ Name: fi
+ Flags: 0x0
+ Address: 0x1B60
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 79
+ Name: s
+ Flags: 0x0
+ Address: 0x1C90
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 88
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3C40
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 165
+ Name: ACycf
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 150
+ Name: C
+ Flags: 0x0
+ Address: 0x1DD0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 160
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3C50
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 175
+ Name: c
+ Flags: 0x0
+ Address: 0x1E00
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 185
+ Name: M
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 180
+ Name: a
+ Flags: 0x0
+ Address: 0x1E40
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 199
+ Name: m
+ Flags: 0x0
+ Address: 0x50A0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 205
+ Name: n
+ Flags: 0x0
+ Address: 0x3C04
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 210
+ Name: N
+ Flags: 0x0
+ Address: 0x50E0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 221
+ Name: f
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 216
+ Name: D
+ Flags: 0x0
+ Address: 0x1D90
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 231
+ Name: d
+ Flags: 0x0
+ Address: 0x1D60
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 611
+ Name: 7AvailableServices
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 430
+ Name: '1'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 301
+ Name: 2ServiceRaterC
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 286
+ Name: 13publishRatingyyF
+ Flags: 0x0
+ Address: 0x2050
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 296
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3D28
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 359
+ Name: ACycf
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 344
+ Name: C
+ Flags: 0x0
+ Address: 0x2000
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 354
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3D20
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 369
+ Name: c
+ Flags: 0x0
+ Address: 0x2030
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 379
+ Name: M
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 374
+ Name: a
+ Flags: 0x0
+ Address: 0x2110
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 393
+ Name: m
+ Flags: 0x0
+ Address: 0x5298
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 399
+ Name: n
+ Flags: 0x0
+ Address: 0x3CEC
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 404
+ Name: N
+ Flags: 0x0
+ Address: 0x52D8
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 415
+ Name: f
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 410
+ Name: D
+ Flags: 0x0
+ Address: 0x2090
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 425
+ Name: d
+ Flags: 0x0
+ Address: 0x2070
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 485
+ Name: 6ServiceRequesterC
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 470
+ Name: 10enterQueueyyF
+ Flags: 0x0
+ Address: 0x1F80
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 480
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3CE4
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 540
+ Name: ACycf
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 525
+ Name: C
+ Flags: 0x0
+ Address: 0x1F30
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 535
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3CDC
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 550
+ Name: c
+ Flags: 0x0
+ Address: 0x1F60
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 560
+ Name: M
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 555
+ Name: a
+ Flags: 0x0
+ Address: 0x20F0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 574
+ Name: m
+ Flags: 0x0
+ Address: 0x51F8
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 580
+ Name: n
+ Flags: 0x0
+ Address: 0x3CA8
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 585
+ Name: N
+ Flags: 0x0
+ Address: 0x5238
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 596
+ Name: f
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 591
+ Name: D
+ Flags: 0x0
+ Address: 0x1FC0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 606
+ Name: d
+ Flags: 0x0
+ Address: 0x1FA0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 638
+ Name: AAC
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 623
+ Name: 10getServiceyyF
+ Flags: 0x0
+ Address: 0x1EB0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 633
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3CA0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 693
+ Name: ABycf
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 678
+ Name: C
+ Flags: 0x0
+ Address: 0x1E60
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 688
+ Name: Tq
+ Flags: 0x0
+ Address: 0x3C98
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 703
+ Name: c
+ Flags: 0x0
+ Address: 0x1E90
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 713
+ Name: M
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 708
+ Name: a
+ Flags: 0x0
+ Address: 0x20D0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 727
+ Name: m
+ Flags: 0x0
+ Address: 0x5158
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 733
+ Name: n
+ Flags: 0x0
+ Address: 0x3C64
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 738
+ Name: N
+ Flags: 0x0
+ Address: 0x5198
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 749
+ Name: f
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 744
+ Name: D
+ Flags: 0x0
+ Address: 0x1EF0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 759
+ Name: d
+ Flags: 0x0
+ Address: 0x1ED0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 931
+ Name: OBJC_
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 890
+ Name: 'CLASS_$_'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 884
+ Name: PodcastSuggestion
+ Flags: 0x0
+ Address: 0x5028
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 925
+ Name: Suggestion
+ Flags: 0x0
+ Address: 0x5078
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 965
+ Name: 'METACLASS_$_'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 959
+ Name: PodcastSuggestion
+ Flags: 0x0
+ Address: 0x5000
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 1000
+ Name: Suggestion
+ Flags: 0x0
+ Address: 0x5050
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 1011
+ Name: SuggestionsKitVersion
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 1006
+ Name: Number
+ Flags: 0x0
+ Address: 0x36F8
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 1031
+ Name: String
+ Flags: 0x0
+ Address: 0x36C0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 1036
+ Name: launchOnTestApps
+ Flags: 0x0
+ Address: 0x1B50
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 1041
+ Name: setupTestSuggestionService
+ Flags: 0x0
+ Address: 0x1B30
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 1046
+ Name: teardownTestSuggestionService
+ Flags: 0x0
+ Address: 0x1B40
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 3740
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 7040
+ - n_strx: 3808
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 7120
+ - n_strx: 3876
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 7472
+ - n_strx: 3949
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8496
+ - n_strx: 4073
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 9072
+ - n_strx: 4284
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 9312
+ - n_strx: 4345
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 9360
+ - n_strx: 4583
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 9632
+ - n_strx: 4646
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 9680
+ - n_strx: 4880
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 9712
+ - n_strx: 4923
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 10112
+ - n_strx: 4968
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 10528
+ - n_strx: 5045
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 10608
+ - n_strx: 5156
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 10656
+ - n_strx: 5234
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 10944
+ - n_strx: 5281
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 10960
+ - n_strx: 5317
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 10976
+ - n_strx: 5356
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 11024
+ - n_strx: 5401
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 11072
+ - n_strx: 5448
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 11232
+ - n_strx: 5498
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 11280
+ - n_strx: 5534
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 11344
+ - n_strx: 5581
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 11536
+ - n_strx: 5659
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 11584
+ - n_strx: 5694
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 11664
+ - n_strx: 5729
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 11744
+ - n_strx: 5799
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 11792
+ - n_strx: 5869
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 11840
+ - n_strx: 5936
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 11904
+ - n_strx: 5952
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12128
+ - n_strx: 6022
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12192
+ - n_strx: 6094
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12256
+ - n_strx: 6166
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12320
+ - n_strx: 6259
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12384
+ - n_strx: 6324
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12608
+ - n_strx: 6391
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12672
+ - n_strx: 6455
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12736
+ - n_strx: 6521
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12800
+ - n_strx: 6589
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12880
+ - n_strx: 6647
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12896
+ - n_strx: 6707
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12912
+ - n_strx: 6764
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12928
+ - n_strx: 6823
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12944
+ - n_strx: 6884
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 12960
+ - n_strx: 6957
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 13024
+ - n_strx: 7033
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 13088
+ - n_strx: 7110
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 13152
+ - n_strx: 7192
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 13232
+ - n_strx: 7270
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 13296
+ - n_strx: 7353
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 13376
+ - n_strx: 7433
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 0
+ n_value: 13440
+ - n_strx: 7514
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 13504
+ - n_strx: 7576
+ n_type: 0x1E
+ n_sect: 1
+ n_desc: 128
+ n_value: 13584
+ - n_strx: 7637
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 13664
+ - n_strx: 7733
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 13712
+ - n_strx: 7828
+ n_type: 0x1E
+ n_sect: 3
+ n_desc: 128
+ n_value: 14130
+ - n_strx: 7856
+ n_type: 0x1E
+ n_sect: 5
+ n_desc: 128
+ n_value: 15352
+ - n_strx: 7879
+ n_type: 0x1E
+ n_sect: 5
+ n_desc: 128
+ n_value: 15448
+ - n_strx: 7905
+ n_type: 0x1E
+ n_sect: 6
+ n_desc: 128
+ n_value: 15664
+ - n_strx: 7948
+ n_type: 0x1E
+ n_sect: 6
+ n_desc: 128
+ n_value: 15670
+ - n_strx: 7987
+ n_type: 0x1E
+ n_sect: 6
+ n_desc: 128
+ n_value: 15676
+ - n_strx: 8042
+ n_type: 0x1E
+ n_sect: 6
+ n_desc: 128
+ n_value: 15682
+ - n_strx: 8093
+ n_type: 0xE
+ n_sect: 8
+ n_desc: 0
+ n_value: 15704
+ - n_strx: 8125
+ n_type: 0xE
+ n_sect: 8
+ n_desc: 0
+ n_value: 15732
+ - n_strx: 8153
+ n_type: 0xE
+ n_sect: 8
+ n_desc: 0
+ n_value: 15748
+ - n_strx: 8197
+ n_type: 0xE
+ n_sect: 8
+ n_desc: 0
+ n_value: 15764
+ - n_strx: 8237
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16744
+ - n_strx: 8291
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16752
+ - n_strx: 8345
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16760
+ - n_strx: 8395
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16768
+ - n_strx: 8453
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16776
+ - n_strx: 8505
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16784
+ - n_strx: 8552
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16792
+ - n_strx: 8601
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16800
+ - n_strx: 8660
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16808
+ - n_strx: 8717
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16816
+ - n_strx: 8774
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16824
+ - n_strx: 8827
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16832
+ - n_strx: 8888
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16840
+ - n_strx: 8943
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16848
+ - n_strx: 8993
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16856
+ - n_strx: 9045
+ n_type: 0x1E
+ n_sect: 14
+ n_desc: 128
+ n_value: 16864
+ - n_strx: 9107
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 16928
+ - n_strx: 9147
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17000
+ - n_strx: 9183
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17072
+ - n_strx: 9216
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17144
+ - n_strx: 9245
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17216
+ - n_strx: 9301
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17288
+ - n_strx: 9348
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17328
+ - n_strx: 9394
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17400
+ - n_strx: 9454
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17472
+ - n_strx: 9504
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17544
+ - n_strx: 9563
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17616
+ - n_strx: 9612
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17688
+ - n_strx: 9667
+ n_type: 0xE
+ n_sect: 17
+ n_desc: 0
+ n_value: 17760
+ - n_strx: 9712
+ n_type: 0xE
+ n_sect: 19
+ n_desc: 0
+ n_value: 20680
+ - n_strx: 9744
+ n_type: 0xE
+ n_sect: 19
+ n_desc: 0
+ n_value: 20864
+ - n_strx: 9772
+ n_type: 0xE
+ n_sect: 19
+ n_desc: 0
+ n_value: 21024
+ - n_strx: 9816
+ n_type: 0xE
+ n_sect: 19
+ n_desc: 0
+ n_value: 21184
+ - n_strx: 9856
+ n_type: 0x1E
+ n_sect: 19
+ n_desc: 128
+ n_value: 21304
+ - n_strx: 9898
+ n_type: 0x1E
+ n_sect: 20
+ n_desc: 0
+ n_value: 21312
+ - n_strx: 9927
+ n_type: 0x1E
+ n_sect: 21
+ n_desc: 0
+ n_value: 21712
+ - n_strx: 9952
+ n_type: 0x1E
+ n_sect: 22
+ n_desc: 0
+ n_value: 21888
+ - n_strx: 9991
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21896
+ - n_strx: 10090
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21912
+ - n_strx: 10125
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21920
+ - n_strx: 10155
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21928
+ - n_strx: 10178
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21936
+ - n_strx: 10244
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21952
+ - n_strx: 10301
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21968
+ - n_strx: 10358
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 21984
+ - n_strx: 10442
+ n_type: 0xE
+ n_sect: 23
+ n_desc: 0
+ n_value: 22000
+ - n_strx: 10525
+ n_type: 0x32
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 10795
+ n_type: 0x32
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 11071
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 11308
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 11330
+ n_type: 0x66
+ n_sect: 3
+ n_desc: 1
+ n_value: 0
+ - n_strx: 11595
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 11624
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 11653
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 11708
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 11728
+ n_type: 0x66
+ n_sect: 3
+ n_desc: 1
+ n_value: 0
+ - n_strx: 11991
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 16928
+ - n_strx: 12031
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17000
+ - n_strx: 12067
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12103
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12135
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12190
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12207
+ n_type: 0x66
+ n_sect: 3
+ n_desc: 1
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 6960
+ - n_strx: 12467
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 6960
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 16
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 6960
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 6976
+ - n_strx: 12495
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 6976
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 16
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 6976
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 6992
+ - n_strx: 12526
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 6992
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 13
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 6992
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12544
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12599
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12612
+ n_type: 0x66
+ n_sect: 3
+ n_desc: 1
+ n_value: 0
+ - n_strx: 12868
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17072
+ - n_strx: 12901
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17144
+ - n_strx: 12930
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12959
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 12984
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 13039
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 13062
+ n_type: 0x66
+ n_sect: 3
+ n_desc: 1
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7008
+ - n_strx: 13324
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7008
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7008
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7040
+ - n_strx: 13390
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7040
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 80
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7040
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7120
+ - n_strx: 13458
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7120
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 112
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7120
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7232
+ - n_strx: 13526
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7232
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 80
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7232
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7312
+ - n_strx: 13590
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7312
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 96
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7312
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7408
+ - n_strx: 13654
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7408
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 64
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7408
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7472
+ - n_strx: 13718
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7472
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 48
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7472
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7520
+ - n_strx: 13791
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7520
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 48
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7520
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7568
+ - n_strx: 13823
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7568
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 64
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7568
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7632
+ - n_strx: 13855
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7632
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 48
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7632
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7680
+ - n_strx: 13891
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7680
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 64
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7680
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7744
+ - n_strx: 13927
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7744
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 22
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7744
+ - n_strx: 13959
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14026
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14054
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14077
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14109
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14175
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14241
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14307
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14345
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14388
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14427
+ n_type: 0x26
+ n_sect: 8
+ n_desc: 0
+ n_value: 15704
+ - n_strx: 14459
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14525
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14579
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14633
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14683
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14741
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14793
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14840
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14889
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 14948
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17216
+ - n_strx: 15004
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17288
+ - n_strx: 15051
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17328
+ - n_strx: 15097
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 15129
+ n_type: 0x26
+ n_sect: 19
+ n_desc: 0
+ n_value: 20680
+ - n_strx: 15161
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 15192
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 15250
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 15265
+ n_type: 0x66
+ n_sect: 3
+ n_desc: 1
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7776
+ - n_strx: 15466
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7776
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 48
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7776
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7824
+ - n_strx: 15498
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7824
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7824
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7856
+ - n_strx: 15530
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7856
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7856
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7888
+ - n_strx: 15571
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7888
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7888
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7920
+ - n_strx: 15599
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7920
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 64
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7920
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7984
+ - n_strx: 15627
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 7984
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 48
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 7984
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8032
+ - n_strx: 15675
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8032
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8032
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8064
+ - n_strx: 15723
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8064
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8064
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8096
+ - n_strx: 15780
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8096
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8096
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8128
+ - n_strx: 15824
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8128
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 64
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8128
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8192
+ - n_strx: 15868
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8192
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 48
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8192
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8240
+ - n_strx: 15912
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8240
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8240
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8272
+ - n_strx: 15956
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8272
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8272
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8304
+ - n_strx: 16012
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8304
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8304
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8336
+ - n_strx: 16052
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8336
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 64
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8336
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8400
+ - n_strx: 16092
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8400
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8400
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8432
+ - n_strx: 16120
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8432
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 32
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8432
+ - n_strx: 1
+ n_type: 0x2E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8464
+ - n_strx: 16164
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 8464
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 22
+ - n_strx: 1
+ n_type: 0x4E
+ n_sect: 1
+ n_desc: 0
+ n_value: 8464
+ - n_strx: 16204
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16230
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16258
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16292
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16335
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16379
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16429
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16488
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16528
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16574
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16632
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16687
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16738
+ n_type: 0x26
+ n_sect: 8
+ n_desc: 0
+ n_value: 15732
+ - n_strx: 16766
+ n_type: 0x26
+ n_sect: 8
+ n_desc: 0
+ n_value: 15748
+ - n_strx: 16810
+ n_type: 0x26
+ n_sect: 8
+ n_desc: 0
+ n_value: 15764
+ - n_strx: 16850
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16907
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 16964
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17017
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17078
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17133
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17183
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17235
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17297
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17400
+ - n_strx: 17357
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17472
+ - n_strx: 17407
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17544
+ - n_strx: 17466
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17616
+ - n_strx: 17515
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17688
+ - n_strx: 17570
+ n_type: 0x26
+ n_sect: 17
+ n_desc: 0
+ n_value: 17760
+ - n_strx: 17615
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17643
+ n_type: 0x26
+ n_sect: 19
+ n_desc: 0
+ n_value: 20864
+ - n_strx: 17671
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17698
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17742
+ n_type: 0x26
+ n_sect: 19
+ n_desc: 0
+ n_value: 21024
+ - n_strx: 17786
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17829
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 17869
+ n_type: 0x26
+ n_sect: 19
+ n_desc: 0
+ n_value: 21184
+ - n_strx: 17909
+ n_type: 0x20
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7408
+ - n_strx: 66
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15432
+ - n_strx: 132
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7232
+ - n_strx: 196
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15416
+ - n_strx: 262
+ n_type: 0xF
+ n_sect: 14
+ n_desc: 0
+ n_value: 16728
+ - n_strx: 328
+ n_type: 0xF
+ n_sect: 3
+ n_desc: 0
+ n_value: 14080
+ - n_strx: 395
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7008
+ - n_strx: 461
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7312
+ - n_strx: 525
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15424
+ - n_strx: 591
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7632
+ - n_strx: 627
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15440
+ - n_strx: 665
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7680
+ - n_strx: 701
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7744
+ - n_strx: 733
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 20640
+ - n_strx: 765
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15364
+ - n_strx: 797
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 20704
+ - n_strx: 828
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7568
+ - n_strx: 860
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7520
+ - n_strx: 892
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8272
+ - n_strx: 948
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15656
+ - n_strx: 1006
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8192
+ - n_strx: 1050
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15648
+ - n_strx: 1096
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8240
+ - n_strx: 1140
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8464
+ - n_strx: 1180
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 21144
+ - n_strx: 1220
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15596
+ - n_strx: 1260
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 21208
+ - n_strx: 1299
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8336
+ - n_strx: 1339
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8304
+ - n_strx: 1379
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8064
+ - n_strx: 1436
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15588
+ - n_strx: 1495
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7984
+ - n_strx: 1543
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15580
+ - n_strx: 1593
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8032
+ - n_strx: 1641
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8432
+ - n_strx: 1685
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 20984
+ - n_strx: 1729
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15528
+ - n_strx: 1773
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 21048
+ - n_strx: 1816
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8128
+ - n_strx: 1860
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8096
+ - n_strx: 1904
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7856
+ - n_strx: 1945
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15520
+ - n_strx: 1988
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7776
+ - n_strx: 2020
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15512
+ - n_strx: 2054
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7824
+ - n_strx: 2086
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 8400
+ - n_strx: 2114
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 20824
+ - n_strx: 2142
+ n_type: 0xF
+ n_sect: 5
+ n_desc: 0
+ n_value: 15460
+ - n_strx: 2170
+ n_type: 0xF
+ n_sect: 19
+ n_desc: 0
+ n_value: 20888
+ - n_strx: 2197
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7920
+ - n_strx: 2225
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 7888
+ - n_strx: 2253
+ n_type: 0xF
+ n_sect: 18
+ n_desc: 0
+ n_value: 20520
+ - n_strx: 2285
+ n_type: 0xF
+ n_sect: 18
+ n_desc: 0
+ n_value: 20600
+ - n_strx: 2310
+ n_type: 0xF
+ n_sect: 18
+ n_desc: 0
+ n_value: 20480
+ - n_strx: 2346
+ n_type: 0xF
+ n_sect: 18
+ n_desc: 0
+ n_value: 20560
+ - n_strx: 2375
+ n_type: 0xF
+ n_sect: 3
+ n_desc: 0
+ n_value: 14072
+ - n_strx: 2404
+ n_type: 0xF
+ n_sect: 3
+ n_desc: 0
+ n_value: 14016
+ - n_strx: 2433
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 6992
+ - n_strx: 2451
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 6960
+ - n_strx: 2479
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 6976
+ - n_strx: 2510
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 2518
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 2541
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 2575
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 2602
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 2640
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 896
+ n_value: 0
+ - n_strx: 2648
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 896
+ n_value: 0
+ - n_strx: 2655
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 2674
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1344
+ n_value: 0
+ - n_strx: 2715
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1600
+ n_value: 0
+ - n_strx: 2748
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1856
+ n_value: 0
+ - n_strx: 2783
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 2880
+ n_value: 0
+ - n_strx: 2820
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 2112
+ n_value: 0
+ - n_strx: 2852
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 2368
+ n_value: 0
+ - n_strx: 2889
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 2624
+ n_value: 0
+ - n_strx: 2919
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 2957
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 2964
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 2981
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 2988
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 3003
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 576
+ n_value: 0
+ - n_strx: 3015
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 576
+ n_value: 0
+ - n_strx: 3036
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 576
+ n_value: 0
+ - n_strx: 3060
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 576
+ n_value: 0
+ - n_strx: 3083
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3107
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3129
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3148
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3169
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3188
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3209
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3232
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3252
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3272
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3295
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3317
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3344
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3368
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3395
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3419
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3440
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3463
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3489
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3515
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3538
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3561
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 3582
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3601
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3620
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3648
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3665
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3684
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3696
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3711
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 1024
+ n_value: 0
+ - n_strx: 3725
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 576
+ n_value: 0
+ StringTable:
+ - ' '
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvM'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvMTq'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvg'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvgTq'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpMV'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpWvd'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpfi'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvs'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvsTq'
+ - '_$s14SuggestionsKit05TrailA0CACycfC'
+ - '_$s14SuggestionsKit05TrailA0CACycfCTq'
+ - '_$s14SuggestionsKit05TrailA0CACycfc'
+ - '_$s14SuggestionsKit05TrailA0CMa'
+ - '_$s14SuggestionsKit05TrailA0CMm'
+ - '_$s14SuggestionsKit05TrailA0CMn'
+ - '_$s14SuggestionsKit05TrailA0CN'
+ - '_$s14SuggestionsKit05TrailA0CfD'
+ - '_$s14SuggestionsKit05TrailA0Cfd'
+ - '_$s17AvailableServices12ServiceRaterC13publishRatingyyF'
+ - '_$s17AvailableServices12ServiceRaterC13publishRatingyyFTq'
+ - '_$s17AvailableServices12ServiceRaterCACycfC'
+ - '_$s17AvailableServices12ServiceRaterCACycfCTq'
+ - '_$s17AvailableServices12ServiceRaterCACycfc'
+ - '_$s17AvailableServices12ServiceRaterCMa'
+ - '_$s17AvailableServices12ServiceRaterCMm'
+ - '_$s17AvailableServices12ServiceRaterCMn'
+ - '_$s17AvailableServices12ServiceRaterCN'
+ - '_$s17AvailableServices12ServiceRaterCfD'
+ - '_$s17AvailableServices12ServiceRaterCfd'
+ - '_$s17AvailableServices16ServiceRequesterC10enterQueueyyF'
+ - '_$s17AvailableServices16ServiceRequesterC10enterQueueyyFTq'
+ - '_$s17AvailableServices16ServiceRequesterCACycfC'
+ - '_$s17AvailableServices16ServiceRequesterCACycfCTq'
+ - '_$s17AvailableServices16ServiceRequesterCACycfc'
+ - '_$s17AvailableServices16ServiceRequesterCMa'
+ - '_$s17AvailableServices16ServiceRequesterCMm'
+ - '_$s17AvailableServices16ServiceRequesterCMn'
+ - '_$s17AvailableServices16ServiceRequesterCN'
+ - '_$s17AvailableServices16ServiceRequesterCfD'
+ - '_$s17AvailableServices16ServiceRequesterCfd'
+ - '_$s17AvailableServicesAAC10getServiceyyF'
+ - '_$s17AvailableServicesAAC10getServiceyyFTq'
+ - '_$s17AvailableServicesAACABycfC'
+ - '_$s17AvailableServicesAACABycfCTq'
+ - '_$s17AvailableServicesAACABycfc'
+ - '_$s17AvailableServicesAACMa'
+ - '_$s17AvailableServicesAACMm'
+ - '_$s17AvailableServicesAACMn'
+ - '_$s17AvailableServicesAACN'
+ - '_$s17AvailableServicesAACfD'
+ - '_$s17AvailableServicesAACfd'
+ - '_OBJC_CLASS_$_PodcastSuggestion'
+ - '_OBJC_CLASS_$_Suggestion'
+ - '_OBJC_METACLASS_$_PodcastSuggestion'
+ - '_OBJC_METACLASS_$_Suggestion'
+ - _SuggestionsKitVersionNumber
+ - _SuggestionsKitVersionString
+ - _launchOnTestApps
+ - _setupTestSuggestionService
+ - _teardownTestSuggestionService
+ - '_$sBoWV'
+ - '_OBJC_CLASS_$_NSObject'
+ - '_OBJC_CLASS_$__TtCs12_SwiftObject'
+ - '_OBJC_METACLASS_$_NSObject'
+ - '_OBJC_METACLASS_$__TtCs12_SwiftObject'
+ - __ZdlPv
+ - __Znwm
+ - __objc_empty_cache
+ - '__swift_FORCE_LOAD_$_swiftCoreFoundation'
+ - '__swift_FORCE_LOAD_$_swiftDarwin'
+ - '__swift_FORCE_LOAD_$_swiftDispatch'
+ - '__swift_FORCE_LOAD_$_swiftFoundation'
+ - '__swift_FORCE_LOAD_$_swiftIOKit'
+ - '__swift_FORCE_LOAD_$_swiftObjectiveC'
+ - '__swift_FORCE_LOAD_$_swiftXPC'
+ - __swift_stdlib_operatingSystemVersion
+ - _abort
+ - _dispatch_once_f
+ - _dlsym
+ - _objc_opt_self
+ - _os_release
+ - _os_unfair_lock_lock
+ - _os_unfair_lock_trylock
+ - _os_unfair_lock_unlock
+ - _pthread_cond_broadcast
+ - _pthread_cond_destroy
+ - _pthread_cond_init
+ - _pthread_cond_signal
+ - _pthread_cond_wait
+ - _pthread_getspecific
+ - _pthread_mutex_destroy
+ - _pthread_mutex_init
+ - _pthread_mutex_lock
+ - _pthread_mutex_trylock
+ - _pthread_mutex_unlock
+ - _pthread_mutexattr_destroy
+ - _pthread_mutexattr_init
+ - _pthread_mutexattr_settype
+ - _pthread_rwlock_destroy
+ - _pthread_rwlock_init
+ - _pthread_rwlock_rdlock
+ - _pthread_rwlock_tryrdlock
+ - _pthread_rwlock_trywrlock
+ - _pthread_rwlock_unlock
+ - _pthread_rwlock_wrlock
+ - _pthread_setspecific
+ - _swift_allocObject
+ - _swift_beginAccess
+ - _swift_deallocClassInstance
+ - _swift_endAccess
+ - _swift_errorRetain
+ - _swift_once
+ - _swift_release
+ - _swift_retain
+ - _voucher_adopt
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpACTK'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpACTk'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvM.resume.0'
+ - __ZN5swift9AsyncTask10waitFutureEPS0_PNS_12AsyncContextEPU14swiftasynccallFvU19swift_async_contextS3_ES3_PNS_11OpaqueValueE
+ - __ZN5swift38swift56override_swift_task_future_waitEPNS_11OpaqueValueEPNS_12AsyncContextEPNS_9AsyncTaskEPU14swiftasynccallFvU19swift_async_contextS3_ES3_PU14swiftasynccallFvS1_U19swift_async_contextS3_S5_S7_S3_E
+ - __ZL31task_future_wait_resume_adapterPN5swift12AsyncContextE
+ - __ZN5swift47swift56override_swift_task_future_wait_throwingEPNS_11OpaqueValueEPNS_12AsyncContextEPNS_9AsyncTaskEPU14swiftasynccallFvU19swift_async_contextS3_U13swift_contextPvES3_PU14swiftasynccallFvS1_U19swift_async_contextS3_S5_S8_S3_E
+ - __ZL33task_wait_throwing_resume_adapterPN5swift12AsyncContextE
+ - __ZN5swift40swift56override_swift_task_create_commonEmPNS_16TaskOptionRecordEPKNS_14TargetMetadataINS_9InProcessEEEPU14swiftasynccallFvU19swift_async_contextPNS_12AsyncContextEEPvmPU9swiftcallFNS_19AsyncTaskAndContextEmS1_S6_SA_SB_mE
+ - __ZN5swift9AsyncTask18flagAsRunning_slowEv
+ - __ZN5swift9AsyncTask20flagAsSuspended_slowEv
+ - __ZN5swift31swift_task_escalateBackdeploy56EPNS_9AsyncTaskENS_11JobPriorityE
+ - '__ZZZN5swift31swift_task_escalateBackdeploy56EPNS_9AsyncTaskENS_11JobPriorityEENK3$_0clEvENUlPvE_8__invokeES4_'
+ - __ZN5swiftL25waitForStatusRecordUnlockEPNS_9AsyncTaskERNS_16ActiveTaskStatusE
+ - __ZN5swift28swift_Concurrency_fatalErrorEjPKcz
+ - __ZN5swift21swift_task_getCurrentEv
+ - __ZN5swift24_swift_task_clearCurrentEv
+ - __ZN5swift16adoptTaskVoucherEPNS_9AsyncTaskE
+ - __ZN5swift18restoreTaskVoucherEPNS_9AsyncTaskE
+ - __ZN5swift14VoucherManager19vouchersAreDisabledEv
+ - __ZL27_initializeVouchersDisabledPv
+ - __ZN5swift14VoucherManager9swapToJobEPNS_3JobE
+ - __ZZZL25swift_voucher_needs_adoptP9voucher_sENKUlvE_clEvENUlPvE_8__invokeES2_
+ - __ZN5swift19_swift_tsan_acquireEPv
+ - __ZN5swift19_swift_tsan_releaseEPv
+ - '__ZZZN5swift19_swift_tsan_acquireEPvENK3$_0clEvENUlS0_E_8__invokeES0_'
+ - '__ZZZN5swift19_swift_tsan_releaseEPvENK3$_0clEvENUlS0_E_8__invokeES0_'
+ - __ZN5swift23ConditionPlatformHelper4initER22_opaque_pthread_cond_t
+ - __ZL9errorNamei
+ - __ZN5swift23ConditionPlatformHelper7destroyER22_opaque_pthread_cond_t
+ - __ZN5swift23ConditionPlatformHelper9notifyOneER22_opaque_pthread_cond_t
+ - __ZN5swift23ConditionPlatformHelper9notifyAllER22_opaque_pthread_cond_t
+ - __ZN5swift23ConditionPlatformHelper4waitER22_opaque_pthread_cond_tR23_opaque_pthread_mutex_t
+ - __ZN5swift19MutexPlatformHelper4initER23_opaque_pthread_mutex_tb
+ - __ZN5swift19MutexPlatformHelper7destroyER23_opaque_pthread_mutex_t
+ - __ZN5swift19MutexPlatformHelper4lockER23_opaque_pthread_mutex_t
+ - __ZN5swift19MutexPlatformHelper6unlockER23_opaque_pthread_mutex_t
+ - __ZN5swift19MutexPlatformHelper8try_lockER23_opaque_pthread_mutex_t
+ - __ZN5swift19MutexPlatformHelper4initER16os_unfair_lock_sb
+ - __ZN5swift19MutexPlatformHelper7destroyER16os_unfair_lock_s
+ - __ZN5swift19MutexPlatformHelper4lockER16os_unfair_lock_s
+ - __ZN5swift19MutexPlatformHelper6unlockER16os_unfair_lock_s
+ - __ZN5swift19MutexPlatformHelper8try_lockER16os_unfair_lock_s
+ - __ZN5swift27ReadWriteLockPlatformHelper4initER24_opaque_pthread_rwlock_t
+ - __ZN5swift27ReadWriteLockPlatformHelper7destroyER24_opaque_pthread_rwlock_t
+ - __ZN5swift27ReadWriteLockPlatformHelper8readLockER24_opaque_pthread_rwlock_t
+ - __ZN5swift27ReadWriteLockPlatformHelper12try_readLockER24_opaque_pthread_rwlock_t
+ - __ZN5swift27ReadWriteLockPlatformHelper9writeLockER24_opaque_pthread_rwlock_t
+ - __ZN5swift27ReadWriteLockPlatformHelper13try_writeLockER24_opaque_pthread_rwlock_t
+ - __ZN5swift27ReadWriteLockPlatformHelper10readUnlockER24_opaque_pthread_rwlock_t
+ - __ZN5swift27ReadWriteLockPlatformHelper11writeUnlockER24_opaque_pthread_rwlock_t
+ - __ZN5swift46swift_task_enterThreadLocalContextBackdeploy56EPc
+ - __ZN5swift45swift_task_exitThreadLocalContextBackdeploy56EPc
+ - '__ZZZN5swift46swift_task_enterThreadLocalContextBackdeploy56EPcENK3$_0clEvENUlPvE_8__invokeES2_'
+ - '__ZZZN5swift45swift_task_exitThreadLocalContextBackdeploy56EPcENK3$_0clEvENUlPvE_8__invokeES2_'
+ - ___swift_reflection_version
+ - '_$s14SuggestionsKitMXM'
+ - '_$s17AvailableServicesMXM'
+ - _symbolic _____ 14SuggestionsKit05TrailA0C
+ - _symbolic _____ 17AvailableServicesAAC
+ - _symbolic _____ 17AvailableServices16ServiceRequesterC
+ - _symbolic _____ 17AvailableServices12ServiceRaterC
+ - '_$s14SuggestionsKit05TrailA0CMF'
+ - '_$s17AvailableServicesAACMF'
+ - '_$s17AvailableServices16ServiceRequesterCMF'
+ - '_$s17AvailableServices12ServiceRaterCMF'
+ - '__swift_FORCE_LOAD_$_swiftFoundation_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftObjectiveC_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftDarwin_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftCoreFoundation_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftDispatch_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftXPC_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftIOKit_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftCompatibility56_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftFoundation_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftObjectiveC_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftDarwin_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftCoreFoundation_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftDispatch_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftXPC_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftIOKit_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftCompatibility56_$_AvailableServices'
+ - '__OBJC_METACLASS_RO_$_PodcastSuggestion'
+ - '__OBJC_CLASS_RO_$_PodcastSuggestion'
+ - '__OBJC_METACLASS_RO_$_Suggestion'
+ - '__OBJC_CLASS_RO_$_Suggestion'
+ - __METACLASS_DATA__TtC14SuggestionsKit16TrailSuggestions
+ - __IVARS__TtC14SuggestionsKit16TrailSuggestions
+ - __DATA__TtC14SuggestionsKit16TrailSuggestions
+ - __METACLASS_DATA__TtC17AvailableServices17AvailableServices
+ - __DATA__TtC17AvailableServices17AvailableServices
+ - __METACLASS_DATA__TtC17AvailableServices16ServiceRequester
+ - __DATA__TtC17AvailableServices16ServiceRequester
+ - __METACLASS_DATA__TtC17AvailableServices12ServiceRater
+ - __DATA__TtC17AvailableServices12ServiceRater
+ - '_$s14SuggestionsKit05TrailA0CMf'
+ - '_$s17AvailableServicesAACMf'
+ - '_$s17AvailableServices16ServiceRequesterCMf'
+ - '_$s17AvailableServices12ServiceRaterCMf'
+ - '__swift_FORCE_LOAD_$_swiftCompatibility56'
+ - _Swift56ConcurrencyOverrides
+ - _Swift56RuntimeOverrides
+ - __ZN5swift25TaskAllocatorSlabMetadataE
+ - '__ZZZN5swift31swift_task_escalateBackdeploy56EPNS_9AsyncTaskENS_11JobPriorityEENK3$_0clEvE7TheLazy'
+ - __ZN5swiftL20StatusRecordLockLockE
+ - __ZL23voucherDisableCheckOnce
+ - __ZL16vouchersDisabled
+ - __ZZZL25swift_voucher_needs_adoptP9voucher_sENKUlvE_clEvE7TheLazy
+ - '__ZZZN5swift19_swift_tsan_acquireEPvENK3$_0clEvE7TheLazy'
+ - '__ZZZN5swift19_swift_tsan_releaseEPvENK3$_0clEvE7TheLazy'
+ - '__ZZZN5swift46swift_task_enterThreadLocalContextBackdeploy56EPcENK3$_0clEvE7TheLazy'
+ - '__ZZZN5swift45swift_task_exitThreadLocalContextBackdeploy56EPcENK3$_0clEvE7TheLazy'
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/SuggestionsKit.build/Objects-normal/x86_64/SuggestionsKit.swiftmodule'
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/AvailableServices.build/Objects-normal/x86_64/AvailableServices.swiftmodule'
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/SuggestionsKit.build/DerivedSources/'
+ - SuggestionsKit_vers.c
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/SuggestionsKit.build/Objects-normal/x86_64/SuggestionsKit_vers.o'
+ - _SuggestionsKitVersionString
+ - _SuggestionsKitVersionNumber
+ - '/Users/cishida/Projects/suggestionskit/SuggestionsKit/'
+ - PodcastSuggestion.m
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/SuggestionsKit.build/Objects-normal/x86_64/PodcastSuggestion.o'
+ - '__OBJC_METACLASS_RO_$_PodcastSuggestion'
+ - '__OBJC_CLASS_RO_$_PodcastSuggestion'
+ - '_OBJC_METACLASS_$_PodcastSuggestion'
+ - '_OBJC_CLASS_$_PodcastSuggestion'
+ - '/Users/cishida/Projects/suggestionskit/SuggestionsKit/'
+ - PrivateHelpers.m
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/SuggestionsKit.build/Objects-normal/x86_64/PrivateHelpers.o'
+ - _setupTestSuggestionService
+ - _teardownTestSuggestionService
+ - _launchOnTestApps
+ - '/Users/cishida/Projects/suggestionskit/SuggestionsKit/'
+ - Suggestion.m
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/SuggestionsKit.build/Objects-normal/x86_64/Suggestion.o'
+ - '__OBJC_METACLASS_RO_$_Suggestion'
+ - '__OBJC_CLASS_RO_$_Suggestion'
+ - '_OBJC_METACLASS_$_Suggestion'
+ - '_OBJC_CLASS_$_Suggestion'
+ - '/Users/cishida/Projects/suggestionskit/SuggestionsKit/'
+ - TrailSuggestions.swift
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Intermediates.noindex/SuggestionsKit.build/Debug/SuggestionsKit.build/Objects-normal/x86_64/TrailSuggestions.o'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpfi'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpACTK'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpACTk'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvg'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvs'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvM'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvM.resume.0'
+ - '_$s14SuggestionsKit05TrailA0Cfd'
+ - '_$s14SuggestionsKit05TrailA0CfD'
+ - '_$s14SuggestionsKit05TrailA0CACycfC'
+ - '_$s14SuggestionsKit05TrailA0CACycfc'
+ - '_$s14SuggestionsKit05TrailA0CMa'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpWvd'
+ - ___swift_reflection_version
+ - '_$s14SuggestionsKitMXM'
+ - '_$s14SuggestionsKit05TrailA0CMn'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvgTq'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvsTq'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvMTq'
+ - '_$s14SuggestionsKit05TrailA0CACycfCTq'
+ - _symbolic _____ 14SuggestionsKit05TrailA0C
+ - _symbolic _____ 17AvailableServicesAAC
+ - '_$s14SuggestionsKit05TrailA0CMF'
+ - '_$s14SuggestionsKit05TrailA0C14activeServices09AvailableE0AECvpMV'
+ - '__swift_FORCE_LOAD_$_swiftFoundation_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftObjectiveC_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftDarwin_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftCoreFoundation_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftDispatch_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftXPC_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftIOKit_$_SuggestionsKit'
+ - '__swift_FORCE_LOAD_$_swiftCompatibility56_$_SuggestionsKit'
+ - __METACLASS_DATA__TtC14SuggestionsKit16TrailSuggestions
+ - __IVARS__TtC14SuggestionsKit16TrailSuggestions
+ - __DATA__TtC14SuggestionsKit16TrailSuggestions
+ - '_$s14SuggestionsKit05TrailA0CMm'
+ - '_$s14SuggestionsKit05TrailA0CMf'
+ - '_$s14SuggestionsKit05TrailA0CN'
+ - '/Users/cishida/Projects/suggestionskit/AvailableServices/'
+ - Services.swift
+ - '/Users/cishida/Library/Developer/Xcode/DerivedData/SuggestionsKit-cqcvpwifregfbkgcgfkoxwikfhff/Build/Intermediates.noindex/InstallIntermediates/macosx/Products/Debug/libAvailableServices.a(Services.o)'
+ - '_$s17AvailableServicesAACABycfC'
+ - '_$s17AvailableServicesAACABycfc'
+ - '_$s17AvailableServicesAAC10getServiceyyF'
+ - '_$s17AvailableServicesAACfd'
+ - '_$s17AvailableServicesAACfD'
+ - '_$s17AvailableServices16ServiceRequesterCACycfC'
+ - '_$s17AvailableServices16ServiceRequesterCACycfc'
+ - '_$s17AvailableServices16ServiceRequesterC10enterQueueyyF'
+ - '_$s17AvailableServices16ServiceRequesterCfd'
+ - '_$s17AvailableServices16ServiceRequesterCfD'
+ - '_$s17AvailableServices12ServiceRaterCACycfC'
+ - '_$s17AvailableServices12ServiceRaterCACycfc'
+ - '_$s17AvailableServices12ServiceRaterC13publishRatingyyF'
+ - '_$s17AvailableServices12ServiceRaterCfd'
+ - '_$s17AvailableServices12ServiceRaterCfD'
+ - '_$s17AvailableServicesAACMa'
+ - '_$s17AvailableServices16ServiceRequesterCMa'
+ - '_$s17AvailableServices12ServiceRaterCMa'
+ - '_$s17AvailableServicesMXM'
+ - '_$s17AvailableServicesAACMn'
+ - '_$s17AvailableServicesAACABycfCTq'
+ - '_$s17AvailableServicesAAC10getServiceyyFTq'
+ - '_$s17AvailableServices16ServiceRequesterCMn'
+ - '_$s17AvailableServices16ServiceRequesterCACycfCTq'
+ - '_$s17AvailableServices16ServiceRequesterC10enterQueueyyFTq'
+ - '_$s17AvailableServices12ServiceRaterCMn'
+ - '_$s17AvailableServices12ServiceRaterCACycfCTq'
+ - '_$s17AvailableServices12ServiceRaterC13publishRatingyyFTq'
+ - _symbolic _____ 17AvailableServices16ServiceRequesterC
+ - _symbolic _____ 17AvailableServices12ServiceRaterC
+ - '_$s17AvailableServicesAACMF'
+ - '_$s17AvailableServices16ServiceRequesterCMF'
+ - '_$s17AvailableServices12ServiceRaterCMF'
+ - '__swift_FORCE_LOAD_$_swiftFoundation_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftObjectiveC_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftDarwin_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftCoreFoundation_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftDispatch_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftXPC_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftIOKit_$_AvailableServices'
+ - '__swift_FORCE_LOAD_$_swiftCompatibility56_$_AvailableServices'
+ - __METACLASS_DATA__TtC17AvailableServices17AvailableServices
+ - __DATA__TtC17AvailableServices17AvailableServices
+ - __METACLASS_DATA__TtC17AvailableServices16ServiceRequester
+ - __DATA__TtC17AvailableServices16ServiceRequester
+ - __METACLASS_DATA__TtC17AvailableServices12ServiceRater
+ - __DATA__TtC17AvailableServices12ServiceRater
+ - '_$s17AvailableServicesAACMm'
+ - '_$s17AvailableServicesAACMf'
+ - '_$s17AvailableServicesAACN'
+ - '_$s17AvailableServices16ServiceRequesterCMm'
+ - '_$s17AvailableServices16ServiceRequesterCMf'
+ - '_$s17AvailableServices16ServiceRequesterCN'
+ - '_$s17AvailableServices12ServiceRaterCMm'
+ - '_$s17AvailableServices12ServiceRaterCMf'
+ - '_$s17AvailableServices12ServiceRaterCN'
+ - ''
+ - ''
+ - ''
+ - ''
+ IndirectSymbols: [ 0x19A, 0x19B, 0x1A4, 0x1A5, 0x1A6, 0x1A7, 0x1A8, 0x1A9,
+ 0x1AA, 0x1AB, 0x1AC, 0x1AD, 0x1AE, 0x1AF, 0x1B0, 0x1B1,
+ 0x1B2, 0x1B3, 0x1B4, 0x1B5, 0x1B6, 0x1B7, 0x1B8, 0x1B9,
+ 0x1BA, 0x1BB, 0x1BC, 0x1BD, 0x1BE, 0x1BF, 0x1C0, 0x1C1,
+ 0x1C2, 0x1C3, 0x1C4, 0x1C5, 0x1C6, 0x1C7, 0x1C8, 0x1C9,
+ 0x1CA, 0x1CB, 0x80000000, 0x19A, 0x19B, 0x1A4, 0x1A5,
+ 0x1A6, 0x1A7, 0x1A8, 0x1A9, 0x1AA, 0x1AB, 0x1AC, 0x1AD,
+ 0x1AE, 0x1AF, 0x1B0, 0x1B1, 0x1B2, 0x1B3, 0x1B4, 0x1B5,
+ 0x1B6, 0x1B7, 0x1B8, 0x1B9, 0x1BA, 0x1BB, 0x1BC, 0x1BD,
+ 0x1BE, 0x1BF, 0x1C0, 0x1C1, 0x1C2, 0x1C3, 0x1C4, 0x1C5,
+ 0x1C6, 0x1C7, 0x1C8, 0x1C9, 0x1CA, 0x1CB ]
+ FunctionStarts: [ 0x1B30, 0x1B40, 0x1B50, 0x1B60, 0x1B80, 0x1BD0, 0x1C40,
+ 0x1C90, 0x1CF0, 0x1D30, 0x1D60, 0x1D90, 0x1DD0, 0x1E00,
+ 0x1E40, 0x1E60, 0x1E90, 0x1EB0, 0x1ED0, 0x1EF0, 0x1F30,
+ 0x1F60, 0x1F80, 0x1FA0, 0x1FC0, 0x2000, 0x2030, 0x2050,
+ 0x2070, 0x2090, 0x20D0, 0x20F0, 0x2110, 0x2130, 0x2370,
+ 0x2460, 0x2490, 0x25A0, 0x25D0, 0x25F0, 0x2780, 0x2920,
+ 0x2970, 0x29A0, 0x2AC0, 0x2AD0, 0x2AE0, 0x2B10, 0x2B40,
+ 0x2BE0, 0x2C10, 0x2C50, 0x2D10, 0x2D40, 0x2D90, 0x2DE0,
+ 0x2E10, 0x2E40, 0x2E80, 0x2F60, 0x2FA0, 0x2FE0, 0x3020,
+ 0x3060, 0x3140, 0x3180, 0x31C0, 0x3200, 0x3250, 0x3260,
+ 0x3270, 0x3280, 0x3290, 0x32A0, 0x32E0, 0x3320, 0x3360,
+ 0x33B0, 0x33F0, 0x3440, 0x3480, 0x34C0, 0x3510, 0x3560,
+ 0x3590 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x68, 0x0,
+ 0x0, 0x0, 0x28, 0x1, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0,
+ 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x18, 0x0, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x18, 0x0, 0x0, 0x0,
+ 0x0, 0x10, 0x6, 0x0, 0x0, 0x40, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0,
+ 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0, 0x0, 0x50,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x1, 0x0, 0x0, 0x0, 0xFD, 0x2, 0x0, 0x0, 0xFD, 0x12,
+ 0x0, 0x0, 0x4, 0x20, 0x0, 0x0, 0x2, 0x6C, 0x0, 0x0,
+ 0x2, 0x7A, 0x0, 0x0, 0x2, 0x9C, 0x0, 0x0, 0x1, 0xAA,
+ 0x0, 0x0, 0x2, 0xC9, 0x0, 0x0, 0x2, 0xE1, 0x0, 0x0,
+ 0x2, 0xB, 0x1, 0x0, 0x2, 0x3B, 0x1, 0x0, 0x2, 0x68,
+ 0x1, 0x0, 0x2, 0x98, 0x1, 0x0, 0x2, 0xC4, 0x1, 0x0,
+ 0x2, 0xEA, 0x1, 0x0, 0x2, 0x14, 0x2, 0x0, 0x2, 0x3A,
+ 0x2, 0x0, 0x2, 0x64, 0x2, 0x0, 0x2, 0x92, 0x2, 0x0,
+ 0x2, 0xBA, 0x2, 0x0, 0x2, 0xE2, 0x2, 0x0, 0x2, 0x10,
+ 0x3, 0x0, 0x2, 0x3C, 0x3, 0x0, 0x2, 0x72, 0x3, 0x0,
+ 0x2, 0xA2, 0x3, 0x0, 0x2, 0xD8, 0x3, 0x0, 0x2, 0x8,
+ 0x4, 0x0, 0x2, 0x32, 0x4, 0x0, 0x2, 0x60, 0x4, 0x0,
+ 0x2, 0x94, 0x4, 0x0, 0x2, 0xC8, 0x4, 0x0, 0x2, 0xF6,
+ 0x4, 0x0, 0x2, 0x24, 0x5, 0x0, 0x4, 0x4E, 0x5, 0x0,
+ 0x4, 0x74, 0x5, 0x0, 0x4, 0x9A, 0x5, 0x0, 0x4, 0xD2,
+ 0x5, 0x0, 0x4, 0xF4, 0x5, 0x0, 0x4, 0x1A, 0x6, 0x0,
+ 0x4, 0x32, 0x6, 0x0, 0x4, 0x50, 0x6, 0x0, 0x2, 0x6D,
+ 0x6, 0x0, 0x1, 0x8A, 0x6, 0x0, 0x1, 0xB0, 0x6, 0x0,
+ 0x1, 0xE6, 0x6, 0x0, 0x4, 0x14, 0x7, 0x0, 0x4, 0x60,
+ 0x7, 0x0, 0x4, 0x70, 0x7, 0x0, 0x0, 0x5F, 0x5F, 0x5A,
+ 0x64, 0x6C, 0x50, 0x76, 0x0, 0x5F, 0x5F, 0x5A, 0x6E,
+ 0x77, 0x6D, 0x0, 0x5F, 0x5F, 0x73, 0x77, 0x69, 0x66,
+ 0x74, 0x5F, 0x73, 0x74, 0x64, 0x6C, 0x69, 0x62, 0x5F,
+ 0x6F, 0x70, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6E, 0x67,
+ 0x53, 0x79, 0x73, 0x74, 0x65, 0x6D, 0x56, 0x65, 0x72,
+ 0x73, 0x69, 0x6F, 0x6E, 0x0, 0x5F, 0x61, 0x62, 0x6F,
+ 0x72, 0x74, 0x0, 0x5F, 0x64, 0x69, 0x73, 0x70, 0x61,
+ 0x74, 0x63, 0x68, 0x5F, 0x6F, 0x6E, 0x63, 0x65, 0x5F,
+ 0x66, 0x0, 0x5F, 0x64, 0x6C, 0x73, 0x79, 0x6D, 0x0,
+ 0x5F, 0x6F, 0x62, 0x6A, 0x63, 0x5F, 0x6F, 0x70, 0x74,
+ 0x5F, 0x73, 0x65, 0x6C, 0x66, 0x0, 0x5F, 0x6F, 0x73,
+ 0x5F, 0x72, 0x65, 0x6C, 0x65, 0x61, 0x73, 0x65, 0x0,
+ 0x5F, 0x6F, 0x73, 0x5F, 0x75, 0x6E, 0x66, 0x61, 0x69,
+ 0x72, 0x5F, 0x6C, 0x6F, 0x63, 0x6B, 0x5F, 0x6C, 0x6F,
+ 0x63, 0x6B, 0x0, 0x5F, 0x6F, 0x73, 0x5F, 0x75, 0x6E,
+ 0x66, 0x61, 0x69, 0x72, 0x5F, 0x6C, 0x6F, 0x63, 0x6B,
+ 0x5F, 0x74, 0x72, 0x79, 0x6C, 0x6F, 0x63, 0x6B, 0x0,
+ 0x5F, 0x6F, 0x73, 0x5F, 0x75, 0x6E, 0x66, 0x61, 0x69,
+ 0x72, 0x5F, 0x6C, 0x6F, 0x63, 0x6B, 0x5F, 0x75, 0x6E,
+ 0x6C, 0x6F, 0x63, 0x6B, 0x0, 0x5F, 0x70, 0x74, 0x68,
+ 0x72, 0x65, 0x61, 0x64, 0x5F, 0x63, 0x6F, 0x6E, 0x64,
+ 0x5F, 0x62, 0x72, 0x6F, 0x61, 0x64, 0x63, 0x61, 0x73,
+ 0x74, 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61,
+ 0x64, 0x5F, 0x63, 0x6F, 0x6E, 0x64, 0x5F, 0x64, 0x65,
+ 0x73, 0x74, 0x72, 0x6F, 0x79, 0x0, 0x5F, 0x70, 0x74,
+ 0x68, 0x72, 0x65, 0x61, 0x64, 0x5F, 0x63, 0x6F, 0x6E,
+ 0x64, 0x5F, 0x69, 0x6E, 0x69, 0x74, 0x0, 0x5F, 0x70,
+ 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5F, 0x63, 0x6F,
+ 0x6E, 0x64, 0x5F, 0x73, 0x69, 0x67, 0x6E, 0x61, 0x6C,
+ 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64,
+ 0x5F, 0x63, 0x6F, 0x6E, 0x64, 0x5F, 0x77, 0x61, 0x69,
+ 0x74, 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61,
+ 0x64, 0x5F, 0x67, 0x65, 0x74, 0x73, 0x70, 0x65, 0x63,
+ 0x69, 0x66, 0x69, 0x63, 0x0, 0x5F, 0x70, 0x74, 0x68,
+ 0x72, 0x65, 0x61, 0x64, 0x5F, 0x6D, 0x75, 0x74, 0x65,
+ 0x78, 0x5F, 0x64, 0x65, 0x73, 0x74, 0x72, 0x6F, 0x79,
+ 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64,
+ 0x5F, 0x6D, 0x75, 0x74, 0x65, 0x78, 0x5F, 0x69, 0x6E,
+ 0x69, 0x74, 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65,
+ 0x61, 0x64, 0x5F, 0x6D, 0x75, 0x74, 0x65, 0x78, 0x5F,
+ 0x6C, 0x6F, 0x63, 0x6B, 0x0, 0x5F, 0x70, 0x74, 0x68,
+ 0x72, 0x65, 0x61, 0x64, 0x5F, 0x6D, 0x75, 0x74, 0x65,
+ 0x78, 0x5F, 0x74, 0x72, 0x79, 0x6C, 0x6F, 0x63, 0x6B,
+ 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64,
+ 0x5F, 0x6D, 0x75, 0x74, 0x65, 0x78, 0x5F, 0x75, 0x6E,
+ 0x6C, 0x6F, 0x63, 0x6B, 0x0, 0x5F, 0x70, 0x74, 0x68,
+ 0x72, 0x65, 0x61, 0x64, 0x5F, 0x6D, 0x75, 0x74, 0x65,
+ 0x78, 0x61, 0x74, 0x74, 0x72, 0x5F, 0x64, 0x65, 0x73,
+ 0x74, 0x72, 0x6F, 0x79, 0x0, 0x5F, 0x70, 0x74, 0x68,
+ 0x72, 0x65, 0x61, 0x64, 0x5F, 0x6D, 0x75, 0x74, 0x65,
+ 0x78, 0x61, 0x74, 0x74, 0x72, 0x5F, 0x69, 0x6E, 0x69,
+ 0x74, 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61,
+ 0x64, 0x5F, 0x6D, 0x75, 0x74, 0x65, 0x78, 0x61, 0x74,
+ 0x74, 0x72, 0x5F, 0x73, 0x65, 0x74, 0x74, 0x79, 0x70,
+ 0x65, 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61,
+ 0x64, 0x5F, 0x72, 0x77, 0x6C, 0x6F, 0x63, 0x6B, 0x5F,
+ 0x64, 0x65, 0x73, 0x74, 0x72, 0x6F, 0x79, 0x0, 0x5F,
+ 0x70, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5F, 0x72,
+ 0x77, 0x6C, 0x6F, 0x63, 0x6B, 0x5F, 0x69, 0x6E, 0x69,
+ 0x74, 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61,
+ 0x64, 0x5F, 0x72, 0x77, 0x6C, 0x6F, 0x63, 0x6B, 0x5F,
+ 0x72, 0x64, 0x6C, 0x6F, 0x63, 0x6B, 0x0, 0x5F, 0x70,
+ 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x5F, 0x72, 0x77,
+ 0x6C, 0x6F, 0x63, 0x6B, 0x5F, 0x74, 0x72, 0x79, 0x72,
+ 0x64, 0x6C, 0x6F, 0x63, 0x6B, 0x0, 0x5F, 0x70, 0x74,
+ 0x68, 0x72, 0x65, 0x61, 0x64, 0x5F, 0x72, 0x77, 0x6C,
+ 0x6F, 0x63, 0x6B, 0x5F, 0x74, 0x72, 0x79, 0x77, 0x72,
+ 0x6C, 0x6F, 0x63, 0x6B, 0x0, 0x5F, 0x70, 0x74, 0x68,
+ 0x72, 0x65, 0x61, 0x64, 0x5F, 0x72, 0x77, 0x6C, 0x6F,
+ 0x63, 0x6B, 0x5F, 0x75, 0x6E, 0x6C, 0x6F, 0x63, 0x6B,
+ 0x0, 0x5F, 0x70, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64,
+ 0x5F, 0x72, 0x77, 0x6C, 0x6F, 0x63, 0x6B, 0x5F, 0x77,
+ 0x72, 0x6C, 0x6F, 0x63, 0x6B, 0x0, 0x5F, 0x70, 0x74,
+ 0x68, 0x72, 0x65, 0x61, 0x64, 0x5F, 0x73, 0x65, 0x74,
+ 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x63, 0x0,
+ 0x5F, 0x73, 0x77, 0x69, 0x66, 0x74, 0x5F, 0x61, 0x6C,
+ 0x6C, 0x6F, 0x63, 0x4F, 0x62, 0x6A, 0x65, 0x63, 0x74,
+ 0x0, 0x5F, 0x73, 0x77, 0x69, 0x66, 0x74, 0x5F, 0x62,
+ 0x65, 0x67, 0x69, 0x6E, 0x41, 0x63, 0x63, 0x65, 0x73,
+ 0x73, 0x0, 0x5F, 0x73, 0x77, 0x69, 0x66, 0x74, 0x5F,
+ 0x64, 0x65, 0x61, 0x6C, 0x6C, 0x6F, 0x63, 0x43, 0x6C,
+ 0x61, 0x73, 0x73, 0x49, 0x6E, 0x73, 0x74, 0x61, 0x6E,
+ 0x63, 0x65, 0x0, 0x5F, 0x73, 0x77, 0x69, 0x66, 0x74,
+ 0x5F, 0x65, 0x6E, 0x64, 0x41, 0x63, 0x63, 0x65, 0x73,
+ 0x73, 0x0, 0x5F, 0x73, 0x77, 0x69, 0x66, 0x74, 0x5F,
+ 0x65, 0x72, 0x72, 0x6F, 0x72, 0x52, 0x65, 0x74, 0x61,
+ 0x69, 0x6E, 0x0, 0x5F, 0x73, 0x77, 0x69, 0x66, 0x74,
+ 0x5F, 0x6F, 0x6E, 0x63, 0x65, 0x0, 0x5F, 0x73, 0x77,
+ 0x69, 0x66, 0x74, 0x5F, 0x72, 0x65, 0x6C, 0x65, 0x61,
+ 0x73, 0x65, 0x0, 0x5F, 0x73, 0x77, 0x69, 0x66, 0x74,
+ 0x5F, 0x72, 0x65, 0x74, 0x61, 0x69, 0x6E, 0x0, 0x5F,
+ 0x76, 0x6F, 0x75, 0x63, 0x68, 0x65, 0x72, 0x5F, 0x61,
+ 0x64, 0x6F, 0x70, 0x74, 0x0, 0x5F, 0x5F, 0x6F, 0x62,
+ 0x6A, 0x63, 0x5F, 0x65, 0x6D, 0x70, 0x74, 0x79, 0x5F,
+ 0x63, 0x61, 0x63, 0x68, 0x65, 0x0, 0x5F, 0x4F, 0x42,
+ 0x4A, 0x43, 0x5F, 0x4D, 0x45, 0x54, 0x41, 0x43, 0x4C,
+ 0x41, 0x53, 0x53, 0x5F, 0x24, 0x5F, 0x4E, 0x53, 0x4F,
+ 0x62, 0x6A, 0x65, 0x63, 0x74, 0x0, 0x5F, 0x4F, 0x42,
+ 0x4A, 0x43, 0x5F, 0x43, 0x4C, 0x41, 0x53, 0x53, 0x5F,
+ 0x24, 0x5F, 0x4E, 0x53, 0x4F, 0x62, 0x6A, 0x65, 0x63,
+ 0x74, 0x0, 0x5F, 0x4F, 0x42, 0x4A, 0x43, 0x5F, 0x4D,
+ 0x45, 0x54, 0x41, 0x43, 0x4C, 0x41, 0x53, 0x53, 0x5F,
+ 0x24, 0x5F, 0x5F, 0x54, 0x74, 0x43, 0x73, 0x31, 0x32,
+ 0x5F, 0x53, 0x77, 0x69, 0x66, 0x74, 0x4F, 0x62, 0x6A,
+ 0x65, 0x63, 0x74, 0x0, 0x5F, 0x24, 0x73, 0x42, 0x6F,
+ 0x57, 0x56, 0x0, 0x5F, 0x4F, 0x42, 0x4A, 0x43, 0x5F,
+ 0x43, 0x4C, 0x41, 0x53, 0x53, 0x5F, 0x24, 0x5F, 0x5F,
+ 0x54, 0x74, 0x43, 0x73, 0x31, 0x32, 0x5F, 0x53, 0x77,
+ 0x69, 0x66, 0x74, 0x4F, 0x62, 0x6A, 0x65, 0x63, 0x74,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+ DataInCode:
+ - Offset: 0x2ED4
+ Length: 140
+ Kind: 0x4
+...
diff --git a/llvm/test/tools/llvm-readtapi/Inputs/objc.yaml b/llvm/test/tools/llvm-readtapi/Inputs/objc.yaml
new file mode 100644
index 000000000000..50219ab1fefb
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/Inputs/objc.yaml
@@ -0,0 +1,692 @@
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 16
+ sizeofcmds: 1912
+ flags: 0x100085
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 552
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 6
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x7B8
+ size: 44
+ offset: 0x7B8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FF8300D1FD7B01A9FD430091E00700F9E10300F9E80740F90041009104000094FD7B41A9FF830091C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x7E4
+ size: 12
+ offset: 0x7E4
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x1
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 50000090100240F900021FD6
+ - sectname: __stub_helper
+ segname: __TEXT
+ addr: 0x7F0
+ size: 36
+ offset: 0x7F0
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 5100009031220091F047BFA930000090100240F900021FD650000018F9FFFF1700000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x814
+ size: 35
+ offset: 0x814
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 53756767657374696F6E00110076313640303A3800420040224E53537472696E672200
+ - sectname: __objc_methname
+ segname: __TEXT
+ addr: 0x837
+ size: 32
+ offset: 0x837
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 2E6378785F6465737472756374005F746F7043686F696365005F6F7468657200
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x858
+ size: 4152
+ offset: 0x858
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000010000002000000000000000200000000200000000000004B80700003800000038000000E40700000000000038000000030000000C00010010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 3
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000000'
+ - sectname: __objc_classlist
+ segname: __DATA_CONST
+ addr: 0x4008
+ size: 8
+ offset: 0x4008
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x10000000
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '3081000000000000'
+ - sectname: __objc_imageinfo
+ segname: __DATA_CONST
+ addr: 0x4010
+ size: 8
+ offset: 0x4010
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000040000000'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 472
+ segname: __DATA
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 5
+ flags: 0
+ Sections:
+ - sectname: __la_symbol_ptr
+ segname: __DATA
+ addr: 0x8000
+ size: 8
+ offset: 0x8000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x7
+ reserved1: 0x2
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0808000000000000'
+ - sectname: __data
+ segname: __DATA
+ addr: 0x8008
+ size: 8
+ offset: 0x8008
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000000'
+ - sectname: __objc_const
+ segname: __DATA
+ addr: 0x8010
+ size: 248
+ offset: 0x8010
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 850100002800000028000000000000000000000000000000140800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000180000000100000037080000000000002108000000000000B807000000000000200000000200000058810000000000004508000000000000290800000000000000000000010000005C8100000000000050080000000000002B08000000000000030000000800000084010000080000001800000000000000000000000000000014080000000000005880000000000000000000000000000078800000000000001F080000000000000000000000000000
+ - sectname: __objc_data
+ segname: __DATA
+ addr: 0x8108
+ size: 80
+ offset: 0x8108
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 000000000000000000000000000000000000000000000000000000000000000010800000000000000881000000000000000000000000000000000000000000000000000000000000C080000000000000
+ - sectname: __objc_ivar
+ segname: __DATA
+ addr: 0x8158
+ size: 8
+ offset: 0x8158
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0800000010000000'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 49152
+ vmsize: 2336
+ fileoff: 49152
+ filesize: 2336
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 49152
+ rebase_size: 24
+ bind_off: 49176
+ bind_size: 128
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 49304
+ lazy_bind_size: 24
+ export_off: 49328
+ export_size: 128
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 49464
+ nsyms: 28
+ stroff: 49928
+ strsize: 1016
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 19
+ iextdefsym: 19
+ nextdefsym: 4
+ iundefsym: 23
+ nundefsym: 5
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 49912
+ nindirectsyms: 3
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 40
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 0
+ compatibility_version: 0
+ Content: tmp.dylib
+ ZeroPadBytes: 7
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 4C4C44F3-5555-3144-A1E7-371E4713DEDE
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 851968
+ sdk: 983040
+ ntools: 1
+ Tools:
+ - tool: 4
+ version: 1179648
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 96
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 197001216
+ compatibility_version: 19660800
+ Content: '/System/Library/Frameworks/Foundation.framework/Versions/C/Foundation'
+ ZeroPadBytes: 3
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 14942208
+ compatibility_version: 65536
+ Content: '/usr/lib/libobjc.A.dylib'
+ ZeroPadBytes: 8
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 88342528
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 49456
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 49464
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 50944
+ datasize: 544
+LinkEditData:
+ RebaseOpcodes:
+ - Opcode: REBASE_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ - Opcode: REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 1
+ ExtraData: [ 0x8 ]
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 1
+ - Opcode: REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 2
+ ExtraData: [ 0x0 ]
+ - Opcode: REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB
+ Imm: 0
+ ExtraData: [ 0x2, 0x20 ]
+ - Opcode: REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ Imm: 2
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 3
+ - Opcode: REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ Imm: 1
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 3
+ - Opcode: REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ Imm: 1
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 3
+ - Opcode: REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ Imm: 4
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 2
+ - Opcode: REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ Imm: 1
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 2
+ - Opcode: REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ Imm: 5
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 2
+ - Opcode: REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+ Imm: 3
+ - Opcode: REBASE_OPCODE_DO_REBASE_IMM_TIMES
+ Imm: 1
+ - Opcode: REBASE_OPCODE_DONE
+ Imm: 0
+ BindOpcodes:
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: dyld_stub_binder
+ - Opcode: BIND_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+ Imm: 3
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 1
+ ULEBExtraData: [ 0x0 ]
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: '_OBJC_METACLASS_$_NSObject'
+ - Opcode: BIND_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+ Imm: 2
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 2
+ ULEBExtraData: [ 0x108 ]
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: __objc_empty_cache
+ - Opcode: BIND_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_ADD_ADDR_ULEB
+ Imm: 0
+ ULEBExtraData: [ 0x20 ]
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: '_OBJC_CLASS_$_NSObject'
+ - Opcode: BIND_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ Symbol: ''
+ - Opcode: BIND_OPCODE_ADD_ADDR_ULEB
+ Imm: 0
+ ULEBExtraData: [ 0xFFFFFFFFFFFFFFF0 ]
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DONE
+ Imm: 0
+ Symbol: ''
+ LazyBindOpcodes:
+ - Opcode: BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 2
+ ULEBExtraData: [ 0x0 ]
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+ Imm: 2
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: _objc_destroyWeak
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DONE
+ Imm: 0
+ Symbol: ''
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 10
+ Name: _OBJC_
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 77
+ Name: 'CLASS_$_Suggestion'
+ Flags: 0x0
+ Address: 0x8130
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 83
+ Name: 'METACLASS_$_Suggestion'
+ Flags: 0x0
+ Address: 0x8108
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 89
+ Name: 'IVAR_$_Suggestion._'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 109
+ Name: other
+ Flags: 0x0
+ Address: 0x815C
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 4
+ NodeOffset: 115
+ Name: topChoice
+ Flags: 0x0
+ Address: 0x8158
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 407
+ n_type: 0x64
+ n_sect: 0
+ n_desc: 0
+ n_value: 0
+ - n_strx: 474
+ n_type: 0x66
+ n_sect: 0
+ n_desc: 1
+ n_value: 0
+ - n_strx: 729
+ n_type: 0x24
+ n_sect: 1
+ n_desc: 0
+ n_value: 1976
+ - n_strx: 1
+ n_type: 0x24
+ n_sect: 0
+ n_desc: 0
+ n_value: 44
+ - n_strx: 757
+ n_type: 0x26
+ n_sect: 12
+ n_desc: 0
+ n_value: 32784
+ - n_strx: 790
+ n_type: 0x26
+ n_sect: 12
+ n_desc: 0
+ n_value: 32856
+ - n_strx: 827
+ n_type: 0x26
+ n_sect: 12
+ n_desc: 0
+ n_value: 32888
+ - n_strx: 866
+ n_type: 0x26
+ n_sect: 12
+ n_desc: 0
+ n_value: 32960
+ - n_strx: 895
+ n_type: 0x20
+ n_sect: 13
+ n_desc: 0
+ n_value: 33032
+ - n_strx: 924
+ n_type: 0x20
+ n_sect: 13
+ n_desc: 0
+ n_value: 33072
+ - n_strx: 949
+ n_type: 0x20
+ n_sect: 14
+ n_desc: 0
+ n_value: 33112
+ - n_strx: 984
+ n_type: 0x20
+ n_sect: 14
+ n_desc: 0
+ n_value: 33116
+ - n_strx: 1
+ n_type: 0x64
+ n_sect: 1
+ n_desc: 0
+ n_value: 0
+ - n_strx: 2
+ n_type: 0xE
+ n_sect: 1
+ n_desc: 0
+ n_value: 1976
+ - n_strx: 30
+ n_type: 0xE
+ n_sect: 12
+ n_desc: 0
+ n_value: 32784
+ - n_strx: 63
+ n_type: 0xE
+ n_sect: 12
+ n_desc: 0
+ n_value: 32856
+ - n_strx: 100
+ n_type: 0xE
+ n_sect: 12
+ n_desc: 0
+ n_value: 32888
+ - n_strx: 139
+ n_type: 0xE
+ n_sect: 12
+ n_desc: 0
+ n_value: 32960
+ - n_strx: 168
+ n_type: 0xE
+ n_sect: 11
+ n_desc: 0
+ n_value: 32776
+ - n_strx: 183
+ n_type: 0xF
+ n_sect: 13
+ n_desc: 0
+ n_value: 33032
+ - n_strx: 212
+ n_type: 0xF
+ n_sect: 13
+ n_desc: 0
+ n_value: 33072
+ - n_strx: 237
+ n_type: 0xF
+ n_sect: 14
+ n_desc: 0
+ n_value: 33112
+ - n_strx: 272
+ n_type: 0xF
+ n_sect: 14
+ n_desc: 0
+ n_value: 33116
+ - n_strx: 303
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 326
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 353
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 372
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 390
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 768
+ n_value: 0
+ StringTable:
+ - ' '
+ - '-[Suggestion .cxx_destruct]'
+ - '__OBJC_METACLASS_RO_$_Suggestion'
+ - '__OBJC_$_INSTANCE_METHODS_Suggestion'
+ - '__OBJC_$_INSTANCE_VARIABLES_Suggestion'
+ - '__OBJC_CLASS_RO_$_Suggestion'
+ - __dyld_private
+ - '_OBJC_METACLASS_$_Suggestion'
+ - '_OBJC_CLASS_$_Suggestion'
+ - '_OBJC_IVAR_$_Suggestion._topChoice'
+ - '_OBJC_IVAR_$_Suggestion._other'
+ - '_OBJC_CLASS_$_NSObject'
+ - '_OBJC_METACLASS_$_NSObject'
+ - __objc_empty_cache
+ - _objc_destroyWeak
+ - dyld_stub_binder
+ - '-[Suggestion .cxx_destruct]'
+ - '__OBJC_METACLASS_RO_$_Suggestion'
+ - '__OBJC_$_INSTANCE_METHODS_Suggestion'
+ - '__OBJC_$_INSTANCE_VARIABLES_Suggestion'
+ - '__OBJC_CLASS_RO_$_Suggestion'
+ - '_OBJC_METACLASS_$_Suggestion'
+ - '_OBJC_CLASS_$_Suggestion'
+ - '_OBJC_IVAR_$_Suggestion._topChoice'
+ - '_OBJC_IVAR_$_Suggestion._other'
+ - ''
+ IndirectSymbols: [ 0x1B, 0x1A, 0x1A ]
+ FunctionStarts: [ 0x7B8 ]
+...
diff --git a/llvm/test/tools/llvm-readtapi/Inputs/thread_local.yaml b/llvm/test/tools/llvm-readtapi/Inputs/thread_local.yaml
new file mode 100644
index 000000000000..bc247110e1ff
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/Inputs/thread_local.yaml
@@ -0,0 +1,228 @@
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x01000007
+ cpusubtype: 0x00000003
+ filetype: 0x00000006
+ ncmds: 13
+ sizeofcmds: 872
+ flags: 0x00900085
+ reserved: 0x00000000
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 1
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x0000000000004000
+ size: 0
+ offset: 0x00004000
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: ''
+ - cmd: LC_SEGMENT_64
+ cmdsize: 232
+ segname: __DATA
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 2
+ flags: 0
+ Sections:
+ - sectname: __thread_vars
+ segname: __DATA
+ addr: 0x0000000000004000
+ size: 24
+ offset: 0x00004000
+ align: 3
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000013
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: '000000000000000000000000000000000000000000000000'
+ - sectname: __thread_bss
+ segname: __DATA
+ addr: 0x0000000000004018
+ size: 1
+ offset: 0x00000000
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000012
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 184
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 104
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '/System/Library/Frameworks/ThreadLocal.framework/ThreadLocal'
+ ZeroPadBytes: 2
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 0
+ rebase_size: 0
+ bind_off: 32768
+ bind_size: 24
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 0
+ lazy_bind_size: 0
+ export_off: 32792
+ export_size: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32824
+ nsyms: 4
+ stroff: 32888
+ strsize: 64
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 1
+ iextdefsym: 1
+ nextdefsym: 1
+ iundefsym: 2
+ nundefsym: 2
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 0
+ nindirectsyms: 0
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: E4FA1DB7-CD39-3568-B80D-BFAF224E5ADC
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 720896
+ sdk: 720896
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 39913216
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 84672512
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32816
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32824
+ datasize: 0
+LinkEditData:
+ BindOpcodes:
+ - Opcode: BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+ Imm: 1
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+ Imm: 0
+ Symbol: __tlv_bootstrap
+ - Opcode: BIND_OPCODE_SET_TYPE_IMM
+ Imm: 1
+ Symbol: ''
+ - Opcode: BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+ Imm: 1
+ ULEBExtraData: [ 0x0000000000000000 ]
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DO_BIND
+ Imm: 0
+ Symbol: ''
+ - Opcode: BIND_OPCODE_DONE
+ Imm: 0
+ Symbol: ''
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0000000000000000
+ Address: 0x0000000000000000
+ Other: 0x0000000000000000
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 13
+ Name: _MySymbol
+ Flags: 0x0000000000000001
+ Address: 0x0000000000004000
+ Other: 0x0000000000000000
+ ImportName: ''
+ NameList:
+ - n_strx: 45
+ n_type: 0x0E
+ n_sect: 3
+ n_desc: 0
+ n_value: 16408
+ - n_strx: 2
+ n_type: 0x0F
+ n_sect: 2
+ n_desc: 0
+ n_value: 16384
+ - n_strx: 12
+ n_type: 0x01
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 28
+ n_type: 0x01
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _MySymbol
+ - __tlv_bootstrap
+ - dyld_stub_binder
+ - '_MySymbol$tlv$init'
+...
diff --git a/llvm/test/tools/llvm-readtapi/Inputs/universal.yaml b/llvm/test/tools/llvm-readtapi/Inputs/universal.yaml
new file mode 100644
index 000000000000..abf17e645cc6
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/Inputs/universal.yaml
@@ -0,0 +1,372 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 2
+FatArchs:
+ - cputype: 0x0000000C
+ cpusubtype: 0x00000009
+ offset: 0x0000000000004000
+ size: 16424
+ align: 14
+ - cputype: 0x0100000C
+ cpusubtype: 0x00000000
+ offset: 0x000000000000C000
+ size: 16464
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACE
+ cputype: 0x0000000C
+ cpusubtype: 0x00000009
+ filetype: 0x00000006
+ ncmds: 13
+ sizeofcmds: 608
+ flags: 0x00100085
+ LoadCommands:
+ - cmd: LC_SEGMENT
+ cmdsize: 124
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 1
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x0000000000004000
+ size: 0
+ offset: 0x00004000
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: ''
+ - cmd: LC_SEGMENT
+ cmdsize: 124
+ segname: __DATA
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 0
+ filesize: 0
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 0
+ Sections:
+ - sectname: __common
+ segname: __DATA
+ addr: 0x0000000000004000
+ size: 4
+ offset: 0x00000000
+ align: 2
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000001
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - cmd: LC_SEGMENT
+ cmdsize: 56
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 40
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 44
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libfoo.dylib'
+ ZeroPadBytes: 1
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 0
+ rebase_size: 0
+ bind_off: 0
+ bind_size: 0
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 0
+ lazy_bind_size: 0
+ export_off: 16384
+ export_size: 16
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 16404
+ nsyms: 1
+ stroff: 16416
+ strsize: 8
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 0
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 0
+ nindirectsyms: 0
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 55BB3EB2-88FF-39A2-BAFB-C8695DC33D4B
+ - cmd: LC_VERSION_MIN_IPHONEOS
+ cmdsize: 16
+ version: 680803
+ sdk: 917504
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_ENCRYPTION_INFO
+ cmdsize: 20
+ cryptoff: 16384
+ cryptsize: 0
+ cryptid: 0
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 16400
+ datasize: 4
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 16404
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0000000000000000
+ Address: 0x0000000000000000
+ Other: 0x0000000000000000
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 8
+ Name: _foo
+ Flags: 0x0000000000000000
+ Address: 0x0000000000004000
+ Other: 0x0000000000000000
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0x0F
+ n_sect: 2
+ n_desc: 0
+ n_value: 16384
+ StringTable:
+ - ' '
+ - _foo
+ - ''
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x0100000C
+ cpusubtype: 0x00000000
+ filetype: 0x00000006
+ ncmds: 14
+ sizeofcmds: 760
+ flags: 0x00100085
+ reserved: 0x00000000
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 1
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x0000000000004000
+ size: 0
+ offset: 0x00004000
+ align: 0
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ content: ''
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 0
+ filesize: 0
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 0
+ Sections:
+ - sectname: __common
+ segname: __DATA
+ addr: 0x0000000000004000
+ size: 4
+ offset: 0x00000000
+ align: 2
+ reloff: 0x00000000
+ nreloc: 0
+ flags: 0x00000001
+ reserved1: 0x00000000
+ reserved2: 0x00000000
+ reserved3: 0x00000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 80
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libfoo.dylib'
+ ZeroPadBytes: 5
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 0
+ rebase_size: 0
+ bind_off: 0
+ bind_size: 0
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 0
+ lazy_bind_size: 0
+ export_off: 16384
+ export_size: 16
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 16408
+ nsyms: 2
+ stroff: 16440
+ strsize: 24
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 0
+ nindirectsyms: 0
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 57F7A4EB-2EDE-3150-80B6-8D587A135790
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 2
+ minos: 917504
+ sdk: 917504
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 39322368
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_ENCRYPTION_INFO_64
+ cmdsize: 24
+ cryptoff: 16384
+ cryptsize: 0
+ cryptid: 0
+ pad: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 84017152
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 16400
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 16408
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0000000000000000
+ Address: 0x0000000000000000
+ Other: 0x0000000000000000
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 8
+ Name: _foo
+ Flags: 0x0000000000000000
+ Address: 0x0000000000004000
+ Other: 0x0000000000000000
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0x0F
+ n_sect: 2
+ n_desc: 0
+ n_value: 16384
+ - n_strx: 7
+ n_type: 0x01
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _foo
+ - dyld_stub_binder
+...
diff --git a/llvm/test/tools/llvm-readtapi/compare-incorrect-format.test b/llvm/test/tools/llvm-readtapi/compare-incorrect-format.test
deleted file mode 100644
index 09dc768518eb..000000000000
--- a/llvm/test/tools/llvm-readtapi/compare-incorrect-format.test
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: mkdir -p %t
-; RUN: yaml2obj %S/Inputs/macho.yaml -o %t/macho.dylib
-; RUN: not llvm-readtapi --compare %S/Inputs/v4A.tbd %t/macho.dylib 2>&1 | FileCheck %s
-
-; CHECK: error: {{.*}}macho.dylib' unsupported file type
-; CHECK-NOT: error:
-; CHECK-NOT: warning:
diff --git a/llvm/test/tools/llvm-readtapi/compare-tbd-dylib.test b/llvm/test/tools/llvm-readtapi/compare-tbd-dylib.test
new file mode 100644
index 000000000000..d31097ff2289
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/compare-tbd-dylib.test
@@ -0,0 +1,41 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: mkdir -p %t
+; RUN: yaml2obj %S/Inputs/macho.yaml -o %t/macho.dylib
+; RUN: not llvm-readtapi --compare %t/macho.tbd %t/macho.dylib 2>&1 | FileCheck %s
+
+; CHECK: macho.tbd
+; CHECK-NEXT: macho.dylib
+; CHECK: Current Version
+; CHECK-NEXT: < 1.2
+; CHECK-NEXT: > 1
+; CHECK-NEXT: Compatibility Version
+; CHECK-NEXT: < 3.1
+; CHECK-NEXT: > 1
+
+;--- macho.tbd
+{
+ "main_library": {
+ "flags": [
+ {
+ "attributes": [
+ "not_app_extension_safe"
+ ]
+ }
+ ],
+ "current_versions":[{"version": "1.2"}],
+ "compatibility_versions":[{ "version": "3.1"}],
+ "install_names": [
+ {
+ "name": "macho-no-exports.dylib"
+ }
+ ],
+ "target_info": [
+ {
+ "min_deployment": "10.10",
+ "target": "x86_64-macos"
+ }
+ ]
+ },
+ "tapi_tbd_version": 5
+}
diff --git a/llvm/test/tools/llvm-readtapi/stubify-invalid.test b/llvm/test/tools/llvm-readtapi/stubify-invalid.test
new file mode 100644
index 000000000000..ae97d47c5199
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/stubify-invalid.test
@@ -0,0 +1,8 @@
+; RUN: rm -rf %t
+; RUN: mkdir -p %t
+; RUN: not llvm-readtapi -stubify %t/objc.dylib %t/flat_namespace.dylib %t/thread_local.dylib %t/fat.dylib --o %t/tmp.tbd 2>&1 | FileCheck %s --allow-empty --check-prefix OUT
+; RUN: not llvm-readtapi -stubify --o %t/tmp.tbd 2>&1 | FileCheck %s --allow-empty --check-prefix IN
+
+; OUT: error: cannot write multiple inputs into single output file
+; IN: error: stubify requires at least one input file
+
diff --git a/llvm/test/tools/llvm-readtapi/stubify.test b/llvm/test/tools/llvm-readtapi/stubify.test
new file mode 100644
index 000000000000..177301cff918
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/stubify.test
@@ -0,0 +1,158 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: yaml2obj %S/Inputs/flat_namespace.yaml -o %t/flat_namespace.dylib
+; RUN: yaml2obj %S/Inputs/thread_local.yaml -o %t/thread_local.dylib
+; RUN: yaml2obj %S/Inputs/universal.yaml -o %t/fat.dylib
+; RUN: yaml2obj %S/Inputs/objc.yaml -o %t/objc.dylib
+; RUN: llvm-readtapi -stubify %t/objc.dylib -o %t/new_objc.tbd 2>&1 | FileCheck %s --allow-empty
+; RUN: llvm-readtapi -stubify %t/objc.dylib %t/flat_namespace.dylib %t/thread_local.dylib %t/fat.dylib -delete-input 2>&1 | FileCheck %s --allow-empty
+; RUN: llvm-readtapi -compare %t/expected_flat.tbd %t/flat_namespace.tbd 2>&1 | FileCheck %s --allow-empty
+; RUN: llvm-readtapi -compare %t/expected_tl.tbd %t/thread_local.tbd 2>&1 | FileCheck %s --allow-empty
+; RUN: llvm-readtapi -compare %t/objc.tbd %t/expected_objc.tbd 2>&1 | FileCheck %s --allow-empty
+; RUN: llvm-readtapi -compare %t/new_objc.tbd %t/expected_objc.tbd 2>&1 | FileCheck %s --allow-empty
+
+; CHECK-NOT: error:
+; CHECK-NOT: warning:
+
+;--- expected_flat.tbd
+{
+ "main_library": {
+ "compatibility_versions": [
+ {
+ "version": "0"
+ }
+ ],
+ "current_versions": [
+ {
+ "version": "0"
+ }
+ ],
+ "exported_symbols": [
+ {
+ "text": {
+ "global": [ "_foo" ]
+ }
+ }
+ ],
+ "flags": [
+ {
+ "attributes": [
+ "flat_namespace",
+ "not_app_extension_safe"
+ ]
+ }
+ ],
+ "install_names": [
+ {
+ "name": "foo.dylib"
+ }
+ ],
+ "target_info": [
+ {
+ "min_deployment": "10.14",
+ "target": "x86_64-macos"
+ }
+ ],
+ "undefined_symbols": [
+ {
+ "data": {
+ "global": [ "dyld_stub_binder", "_bar", "_putchar" ]
+ }
+ }
+ ]
+ },
+ "tapi_tbd_version": 5
+}
+
+;--- expected_tl.tbd
+{
+ "main_library": {
+ "compatibility_versions": [
+ {
+ "version": "0"
+ }
+ ],
+ "current_versions": [
+ {
+ "version": "0"
+ }
+ ],
+ "exported_symbols": [
+ {
+ "data": {
+ "thread_local": [
+ "_MySymbol"
+ ]
+ }
+ }
+ ],
+ "flags": [
+ {
+ "attributes": [
+ "not_app_extension_safe"
+ ]
+ }
+ ],
+ "install_names": [
+ {
+ "name": "/System/Library/Frameworks/ThreadLocal.framework/ThreadLocal"
+ }
+ ],
+ "target_info": [
+ {
+ "min_deployment": "11",
+ "target": "x86_64-macos"
+ }
+ ]
+ },
+ "tapi_tbd_version": 5
+}
+
+;--- expected_objc.tbd
+{
+ "main_library": {
+ "compatibility_versions": [
+ {
+ "version": "0"
+ }
+ ],
+ "current_versions": [
+ {
+ "version": "0"
+ }
+ ],
+ "exported_symbols": [
+ {
+ "data": {
+ "objc_class": [
+ "Suggestion"
+ ],
+ "objc_ivar": [
+ "Suggestion._topChoice",
+ "Suggestion._other"
+ ]
+ }
+ }
+ ],
+ "flags": [
+ {
+ "attributes": [
+ "not_app_extension_safe"
+ ]
+ }
+ ],
+ "install_names": [
+ {
+ "name": "tmp.dylib"
+ }
+ ],
+ "target_info": [
+ {
+ "min_deployment": "13",
+ "target": "arm64-macos"
+ }
+ ]
+ },
+ "tapi_tbd_version": 5
+}
+
diff --git a/llvm/test/tools/llvm-readtapi/write.test b/llvm/test/tools/llvm-readtapi/write.test
index 1ec7a40a2e40..90ba1c25ec12 100644
--- a/llvm/test/tools/llvm-readtapi/write.test
+++ b/llvm/test/tools/llvm-readtapi/write.test
@@ -1,77 +1,34 @@
; RUN: rm -rf %t
; RUN: split-file %s %t
-; RUN: llvm-readtapi %t/arm64.tbd 2>&1 | FileCheck %s
+; RUN: mkdir -p %t
+; RUN: yaml2obj %S/Inputs/macho.yaml -o %t/macho.dylib
+; RUN: llvm-readtapi %t/macho.dylib -o %t/out.tbd 2>&1 | FileCheck %s --allow-empty
+; RUN: llvm-readtapi -compare %t/out.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
-; CHECK-NOT: error
-; CHECK-NOT: warning
-; CHECK: {
-; CHECK-NEXT: "main_library": {
-; CHECK-NEXT: "allowable_clients": [
-; CHECK-NEXT: {
-; CHECK-NEXT: "clients": [
-; CHECK-NEXT: "ClientAll"
-; CHECK-NEXT: ]
-; CHECK-NEXT: }
-; CHECK-NEXT: ],
-; CHECK-NEXT: "exported_symbols": [
-; CHECK-NEXT: {
-; CHECK-NEXT: "data": {
-; CHECK-NEXT: "global": [
-; CHECK-NEXT: "_sym1"
-; CHECK-NEXT: ],
-; CHECK-NEXT: "objc_class": [
-; CHECK-NEXT: "_A"
-; CHECK-NEXT: ],
-; CHECK-NEXT: "objc_ivar": [
-; CHECK-NEXT: "_A._ivar1"
-; CHECK-NEXT: ],
-; CHECK-NEXT: "thread_local": [
-; CHECK-NEXT: "_tlv1"
-; CHECK-NEXT: ],
-; CHECK-NEXT: "weak": [
-; CHECK-NEXT: "_weak1"
-; CHECK-NEXT: ]
-; CHECK-NEXT: }
-; CHECK-NEXT: }
-; CHECK-NEXT: ],
-; CHECK-NEXT: "install_names": [
-; CHECK-NEXT: {
-; CHECK-NEXT: "name": "/usr/lib/libfat.dylib"
-; CHECK-NEXT: }
-; CHECK-NEXT: ],
-; CHECK-NEXT: "reexported_libraries": [
-; CHECK-NEXT: {
-; CHECK-NEXT: "names": [
-; CHECK-NEXT: "/usr/lib/liball.dylib"
-; CHECK-NEXT: ]
-; CHECK-NEXT: }
-; CHECK-NEXT: ],
-; CHECK-NEXT: "target_info": [
-; CHECK-NEXT: {
-; CHECK-NEXT: "target": "arm64-macos"
-; CHECK-NEXT: }
-; CHECK-NEXT: ]
-; CHECK-NEXT: },
-; CHECK-NEXT: "tapi_tbd_version": 5
-; CHECK-NEXT: }
+; CHECK-NOT: error
+; CHECK-NOT: warning
-
-;--- arm64.tbd
---- !tapi-tbd
-tbd-version: 4
-targets: [ arm64-macos ]
-install-name: '/usr/lib/libfat.dylib'
-allowable-clients:
- - targets: [ arm64-macos ]
- clients: [ ClientAll ]
-reexported-libraries:
- - targets: [ arm64-macos ]
- libraries: [ '/usr/lib/liball.dylib' ]
-exports:
- - targets: [ arm64-macos ]
- symbols: [ _sym1 ]
- objc-classes: [ _A ]
- objc-ivars: [ _A._ivar1 ]
- weak-symbols: [ _weak1 ]
- thread-local-symbols: [ _tlv1 ]
-...
+;--- expected.tbd
+{
+ "main_library": {
+ "flags": [
+ {
+ "attributes": [
+ "not_app_extension_safe"
+ ]
+ }
+ ],
+ "install_names": [
+ {
+ "name": "macho-no-exports.dylib"
+ }
+ ],
+ "target_info": [
+ {
+ "min_deployment": "10.10",
+ "target": "x86_64-macos"
+ }
+ ]
+ },
+ "tapi_tbd_version": 5
+}
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp
index fcb6392a1d95..299b7856ec0b 100644
--- a/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/llvm/tools/llvm-ar/llvm-ar.cpp
@@ -1287,8 +1287,7 @@ static const char *matchFlagWithArg(StringRef Expected,
ArrayRef<const char *> Args) {
StringRef Arg = *ArgIt;
- if (Arg.starts_with("--"))
- Arg = Arg.substr(2);
+ Arg.consume_front("--");
size_t len = Expected.size();
if (Arg == Expected) {
diff --git a/llvm/tools/llvm-cxxfilt/Opts.td b/llvm/tools/llvm-cxxfilt/Opts.td
index f652a1a7f88b..034cb267aab8 100644
--- a/llvm/tools/llvm-cxxfilt/Opts.td
+++ b/llvm/tools/llvm-cxxfilt/Opts.td
@@ -17,6 +17,7 @@ multiclass Eq<string name, string help> {
def help : FF<"help", "Display this help">;
defm strip_underscore : BB<"strip-underscore", "Strip the leading underscore", "Don't strip the leading underscore">;
def types : FF<"types", "Attempt to demangle types as well as function names">;
+def no_params : FF<"no-params", "Skip function parameters and return types">;
def version : FF<"version", "Display the version">;
defm : Eq<"format", "Specify mangling format. Currently ignored because only 'gnu' is supported">;
@@ -25,4 +26,5 @@ def : F<"s", "Alias for --format">;
def : F<"_", "Alias for --strip-underscore">, Alias<strip_underscore>;
def : F<"h", "Alias for --help">, Alias<help>;
def : F<"n", "Alias for --no-strip-underscore">, Alias<no_strip_underscore>;
+def : F<"p", "Alias for --no-params">, Alias<no_params>;
def : F<"t", "Alias for --types">, Alias<types>;
diff --git a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
index 4b9d88a65066..26a1f2f4afeb 100644
--- a/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
+++ b/llvm/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
@@ -54,6 +54,7 @@ public:
};
} // namespace
+static bool ParseParams;
static bool StripUnderscore;
static bool Types;
@@ -74,18 +75,19 @@ static std::string demangle(const std::string &Mangled) {
}
std::string Result;
- if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot))
+ if (nonMicrosoftDemangle(DecoratedStr, Result, CanHaveLeadingDot,
+ ParseParams))
return Result;
std::string Prefix;
char *Undecorated = nullptr;
if (Types)
- Undecorated = itaniumDemangle(DecoratedStr);
+ Undecorated = itaniumDemangle(DecoratedStr, ParseParams);
if (!Undecorated && starts_with(DecoratedStr, "__imp_")) {
Prefix = "import thunk for ";
- Undecorated = itaniumDemangle(DecoratedStr.substr(6));
+ Undecorated = itaniumDemangle(DecoratedStr.substr(6), ParseParams);
}
Result = Undecorated ? Prefix + Undecorated : Mangled;
@@ -173,6 +175,8 @@ int llvm_cxxfilt_main(int argc, char **argv, const llvm::ToolContext &) {
else
StripUnderscore = Triple(sys::getProcessTriple()).isOSBinFormatMachO();
+ ParseParams = !Args.hasArg(OPT_no_params);
+
Types = Args.hasArg(OPT_types);
std::vector<std::string> Decorated = Args.getAllArgValues(OPT_INPUT);
diff --git a/llvm/tools/llvm-diff/llvm-diff.cpp b/llvm/tools/llvm-diff/llvm-diff.cpp
index 6fe18a51c9f5..3e77b1ed89b0 100644
--- a/llvm/tools/llvm-diff/llvm-diff.cpp
+++ b/llvm/tools/llvm-diff/llvm-diff.cpp
@@ -42,8 +42,7 @@ static std::unique_ptr<Module> readModule(LLVMContext &Context,
static void diffGlobal(DifferenceEngine &Engine, Module &L, Module &R,
StringRef Name) {
// Drop leading sigils from the global name.
- if (Name.starts_with("@"))
- Name = Name.substr(1);
+ Name.consume_front("@");
Function *LFn = L.getFunction(Name);
Function *RFn = R.getFunction(Name);
diff --git a/llvm/tools/llvm-dwarfdump/fuzzer/llvm-dwarfdump-fuzzer.cpp b/llvm/tools/llvm-dwarfdump/fuzzer/llvm-dwarfdump-fuzzer.cpp
index 1d74856c0fb8..0e74d0be76f1 100644
--- a/llvm/tools/llvm-dwarfdump/fuzzer/llvm-dwarfdump-fuzzer.cpp
+++ b/llvm/tools/llvm-dwarfdump/fuzzer/llvm-dwarfdump-fuzzer.cpp
@@ -20,8 +20,8 @@ using namespace llvm;
using namespace object;
extern "C" int LLVMFuzzerTestOneInput(uint8_t *data, size_t size) {
- std::unique_ptr<MemoryBuffer> Buff = MemoryBuffer::getMemBuffer(
- StringRef((const char *)data, size), "", false);
+ std::string Payload(reinterpret_cast<const char *>(data), size);
+ std::unique_ptr<MemoryBuffer> Buff = MemoryBuffer::getMemBuffer(Payload);
Expected<std::unique_ptr<ObjectFile>> ObjOrErr =
ObjectFile::createObjectFile(Buff->getMemBufferRef());
diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.cpp b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
index c2fad7c731a7..96b5a068ff21 100644
--- a/llvm/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
@@ -97,7 +97,7 @@ static bool generateSnippetSetupCode(
// Load in the stack register now as we're done using it elsewhere
// and need to set the value in preparation for executing the
// snippet.
- if (RV.Register == StackPointerRegister)
+ if (RV.Register != StackPointerRegister)
continue;
const auto SetRegisterCode = ET.setRegTo(*MSI, RV.Register, RV.Value);
if (SetRegisterCode.empty())
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 12e774e1a4b8..1ee59a86ebbd 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -499,19 +499,20 @@ BenchmarkRunner::getRunnableConfiguration(
const SnippetRepetitor &Repetitor) const {
RunnableConfiguration RC;
- Benchmark &InstrBenchmark = RC.InstrBenchmark;
- InstrBenchmark.Mode = Mode;
- InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU());
- InstrBenchmark.LLVMTriple =
+ Benchmark &BenchmarkResult = RC.BenchmarkResult;
+ BenchmarkResult.Mode = Mode;
+ BenchmarkResult.CpuName =
+ std::string(State.getTargetMachine().getTargetCPU());
+ BenchmarkResult.LLVMTriple =
State.getTargetMachine().getTargetTriple().normalize();
- InstrBenchmark.NumRepetitions = NumRepetitions;
- InstrBenchmark.Info = BC.Info;
+ BenchmarkResult.NumRepetitions = NumRepetitions;
+ BenchmarkResult.Info = BC.Info;
const std::vector<MCInst> &Instructions = BC.Key.Instructions;
bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess;
- InstrBenchmark.Key = BC.Key;
+ BenchmarkResult.Key = BC.Key;
// Assemble at least kMinInstructionsForSnippet instructions by repeating
// the snippet for debug/analysis. This is so that the user clearly
@@ -526,7 +527,7 @@ BenchmarkRunner::getRunnableConfiguration(
return std::move(E);
if (auto Err = getBenchmarkFunctionBytes(*Snippet,
- InstrBenchmark.AssembledSnippet))
+ BenchmarkResult.AssembledSnippet))
return std::move(Err);
}
@@ -534,8 +535,9 @@ BenchmarkRunner::getRunnableConfiguration(
// measurements.
if (BenchmarkPhaseSelector >
BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
- auto Snippet = assembleSnippet(BC, Repetitor, InstrBenchmark.NumRepetitions,
- LoopBodySize, GenerateMemoryInstructions);
+ auto Snippet =
+ assembleSnippet(BC, Repetitor, BenchmarkResult.NumRepetitions,
+ LoopBodySize, GenerateMemoryInstructions);
if (Error E = Snippet.takeError())
return std::move(E);
RC.ObjectFile = getObjectFromBuffer(*Snippet);
@@ -577,7 +579,7 @@ BenchmarkRunner::createFunctionExecutor(
std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
RunnableConfiguration &&RC,
const std::optional<StringRef> &DumpFile) const {
- Benchmark &InstrBenchmark = RC.InstrBenchmark;
+ Benchmark &BenchmarkResult = RC.BenchmarkResult;
object::OwningBinary<object::ObjectFile> &ObjectFile = RC.ObjectFile;
if (DumpFile && BenchmarkPhaseSelector >
@@ -585,38 +587,38 @@ std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
auto ObjectFilePath =
writeObjectFile(ObjectFile.getBinary()->getData(), *DumpFile);
if (Error E = ObjectFilePath.takeError()) {
- return {std::move(E), std::move(InstrBenchmark)};
+ return {std::move(E), std::move(BenchmarkResult)};
}
outs() << "Check generated assembly with: /usr/bin/objdump -d "
<< *ObjectFilePath << "\n";
}
if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
- InstrBenchmark.Error = "actual measurements skipped.";
- return {Error::success(), std::move(InstrBenchmark)};
+ BenchmarkResult.Error = "actual measurements skipped.";
+ return {Error::success(), std::move(BenchmarkResult)};
}
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor =
- createFunctionExecutor(std::move(ObjectFile), RC.InstrBenchmark.Key);
+ createFunctionExecutor(std::move(ObjectFile), RC.BenchmarkResult.Key);
if (!Executor)
- return {Executor.takeError(), std::move(InstrBenchmark)};
+ return {Executor.takeError(), std::move(BenchmarkResult)};
auto NewMeasurements = runMeasurements(**Executor);
if (Error E = NewMeasurements.takeError()) {
- return {std::move(E), std::move(InstrBenchmark)};
+ return {std::move(E), std::move(BenchmarkResult)};
}
- assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions");
+ assert(BenchmarkResult.NumRepetitions > 0 && "invalid NumRepetitions");
for (BenchmarkMeasure &BM : *NewMeasurements) {
// Scale the measurements by instruction.
- BM.PerInstructionValue /= InstrBenchmark.NumRepetitions;
+ BM.PerInstructionValue /= BenchmarkResult.NumRepetitions;
// Scale the measurements by snippet.
BM.PerSnippetValue *=
- static_cast<double>(InstrBenchmark.Key.Instructions.size()) /
- InstrBenchmark.NumRepetitions;
+ static_cast<double>(BenchmarkResult.Key.Instructions.size()) /
+ BenchmarkResult.NumRepetitions;
}
- InstrBenchmark.Measurements = std::move(*NewMeasurements);
+ BenchmarkResult.Measurements = std::move(*NewMeasurements);
- return {Error::success(), std::move(InstrBenchmark)};
+ return {Error::success(), std::move(BenchmarkResult)};
}
Expected<std::string>
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
index 2c48d07e37ca..d746a0f77564 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -56,7 +56,7 @@ public:
private:
RunnableConfiguration() = default;
- Benchmark InstrBenchmark;
+ Benchmark BenchmarkResult;
object::OwningBinary<object::ObjectFile> ObjectFile;
};
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index 2c2d1adb0fcf..0ab74b8e00da 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -39,6 +39,9 @@
#endif
#ifdef __linux__
+#ifdef __x86_64__
+#include <asm/prctl.h>
+#endif // __x86_64__
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
@@ -907,9 +910,94 @@ void ExegesisX86Target::decrementLoopCounterAndJump(
.addImm(X86::COND_NE);
}
+void generateRegisterStackPush(unsigned int Register,
+ std::vector<MCInst> &GeneratedCode) {
+ GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register));
+}
+
+void generateRegisterStackPop(unsigned int Register,
+ std::vector<MCInst> &GeneratedCode) {
+ GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register));
+}
+
+void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) {
+ GeneratedCode.push_back(
+ loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber)));
+ GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL));
+}
+
+// The functions below for saving and restoring system call registers are only
+// used when llvm-exegesis is built on Linux.
+#ifdef __linux__
+constexpr std::array<unsigned, 6> SyscallArgumentRegisters{
+ X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9};
+
+static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode,
+ unsigned ArgumentCount) {
+ assert(ArgumentCount <= 6 &&
+         "System calls on X86-64 Linux can only take six arguments");
+ // Preserve RCX and R11 (Clobbered by the system call).
+ generateRegisterStackPush(X86::RCX, GeneratedCode);
+ generateRegisterStackPush(X86::R11, GeneratedCode);
+ // Preserve RAX (used for the syscall number/return value).
+ generateRegisterStackPush(X86::RAX, GeneratedCode);
+ // Preserve the registers used to pass arguments to the system call.
+ for (unsigned I = 0; I < ArgumentCount; ++I)
+ generateRegisterStackPush(SyscallArgumentRegisters[I], GeneratedCode);
+}
+
+static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode,
+ unsigned ArgumentCount) {
+ assert(ArgumentCount <= 6 &&
+         "System calls on X86-64 Linux can only take six arguments");
+  // Restore the argument registers in the reverse of the order in which they
+  // were saved.
+ for (unsigned I = ArgumentCount; I > 0; --I) {
+ generateRegisterStackPop(SyscallArgumentRegisters[I - 1], GeneratedCode);
+ }
+ generateRegisterStackPop(X86::RAX, GeneratedCode);
+ generateRegisterStackPop(X86::R11, GeneratedCode);
+ generateRegisterStackPop(X86::RCX, GeneratedCode);
+}
+#endif // __linux__
+
+static std::vector<MCInst> loadImmediateSegmentRegister(unsigned Reg,
+ const APInt &Value) {
+#if defined(__x86_64__) && defined(__linux__)
+ assert(Value.getBitWidth() <= 64 && "Value must fit in the register.");
+ std::vector<MCInst> loadSegmentRegisterCode;
+  // Preserve the syscall registers here, as we don't want to make any
+  // assumptions about the order in which registers are loaded, and we might
+  // have already loaded registers that we are about to clobber.
+ saveSyscallRegisters(loadSegmentRegisterCode, 2);
+ // Generate the instructions to make the arch_prctl system call to set
+ // the registers.
+ int SyscallCode = 0;
+ if (Reg == X86::FS)
+ SyscallCode = ARCH_SET_FS;
+ else if (Reg == X86::GS)
+ SyscallCode = ARCH_SET_GS;
+ else
+ llvm_unreachable("Only the segment registers GS and FS are supported");
+ loadSegmentRegisterCode.push_back(
+ loadImmediate(X86::RDI, 64, APInt(64, SyscallCode)));
+ loadSegmentRegisterCode.push_back(loadImmediate(X86::RSI, 64, Value));
+ generateSyscall(SYS_arch_prctl, loadSegmentRegisterCode);
+ // Restore the registers in reverse order
+ restoreSyscallRegisters(loadSegmentRegisterCode, 2);
+ return loadSegmentRegisterCode;
+#else
+  llvm_unreachable("Loading immediate segment registers is only supported by "
+                   "llvm-exegesis on x86-64 Linux");
+#endif // defined(__x86_64__) && defined(__linux__)
+}
+
std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
unsigned Reg,
const APInt &Value) const {
+ if (X86::SEGMENT_REGRegClass.contains(Reg))
+ return loadImmediateSegmentRegister(Reg, Value);
if (X86::GR8RegClass.contains(Reg))
return {loadImmediate(Reg, 8, Value)};
if (X86::GR16RegClass.contains(Reg))
@@ -992,12 +1080,6 @@ static constexpr const intptr_t VAddressSpaceCeiling = 0xC0000000;
static constexpr const intptr_t VAddressSpaceCeiling = 0x0000800000000000;
#endif
-void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) {
- GeneratedCode.push_back(
- loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber)));
- GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL));
-}
-
void generateRoundToNearestPage(unsigned int Register,
std::vector<MCInst> &GeneratedCode) {
int PageSizeShift = static_cast<int>(round(log2(getpagesize())));
@@ -1157,29 +1239,11 @@ intptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const {
return VAddressSpaceCeiling - 2 * getpagesize();
}
-void generateRegisterStackPush(unsigned int Register,
- std::vector<MCInst> &GeneratedCode) {
- GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register));
-}
-
-void generateRegisterStackPop(unsigned int Register,
- std::vector<MCInst> &GeneratedCode) {
- GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register));
-}
-
std::vector<MCInst>
ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const {
std::vector<MCInst> ConfigurePerfCounterCode;
- if(SaveRegisters) {
- // Preserve RAX, RDI, and RSI by pushing them to the stack.
- generateRegisterStackPush(X86::RAX, ConfigurePerfCounterCode);
- generateRegisterStackPush(X86::RDI, ConfigurePerfCounterCode);
- generateRegisterStackPush(X86::RSI, ConfigurePerfCounterCode);
- // RCX and R11 will get clobbered by the syscall instruction, so save them
- // as well.
- generateRegisterStackPush(X86::RCX, ConfigurePerfCounterCode);
- generateRegisterStackPush(X86::R11, ConfigurePerfCounterCode);
- }
+ if (SaveRegisters)
+ saveSyscallRegisters(ConfigurePerfCounterCode, 2);
ConfigurePerfCounterCode.push_back(
loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress())));
ConfigurePerfCounterCode.push_back(MCInstBuilder(X86::MOV32rm)
@@ -1192,15 +1256,8 @@ ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const
ConfigurePerfCounterCode.push_back(
loadImmediate(X86::RSI, 64, APInt(64, Request)));
generateSyscall(SYS_ioctl, ConfigurePerfCounterCode);
- if(SaveRegisters) {
- // Restore R11 then RCX
- generateRegisterStackPop(X86::R11, ConfigurePerfCounterCode);
- generateRegisterStackPop(X86::RCX, ConfigurePerfCounterCode);
- // Restore RAX, RDI, and RSI, in reverse order.
- generateRegisterStackPop(X86::RSI, ConfigurePerfCounterCode);
- generateRegisterStackPop(X86::RDI, ConfigurePerfCounterCode);
- generateRegisterStackPop(X86::RAX, ConfigurePerfCounterCode);
- }
+ if (SaveRegisters)
+ restoreSyscallRegisters(ConfigurePerfCounterCode, 2);
return ConfigurePerfCounterCode;
}
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
index a5f8a09dcb24..1b35fde815f1 100644
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -410,7 +410,7 @@ static void runBenchmarkConfigurations(
std::optional<StringRef> DumpFile;
if (DumpObjectToDisk.getNumOccurrences())
DumpFile = DumpObjectToDisk;
- auto [Err, InstrBenchmark] =
+ auto [Err, BenchmarkResult] =
Runner.runConfiguration(std::move(RC), DumpFile);
if (Err) {
// Errors from executing the snippets are fine.
@@ -419,9 +419,9 @@ static void runBenchmarkConfigurations(
llvm::errs() << "llvm-exegesis error: " << toString(std::move(Err));
exit(1);
}
- InstrBenchmark.Error = toString(std::move(Err));
+ BenchmarkResult.Error = toString(std::move(Err));
}
- AllResults.push_back(std::move(InstrBenchmark));
+ AllResults.push_back(std::move(BenchmarkResult));
}
Benchmark &Result = AllResults.front();
diff --git a/llvm/tools/llvm-objdump/XCOFFDump.cpp b/llvm/tools/llvm-objdump/XCOFFDump.cpp
index 0f6147924f8a..d9c00c096209 100644
--- a/llvm/tools/llvm-objdump/XCOFFDump.cpp
+++ b/llvm/tools/llvm-objdump/XCOFFDump.cpp
@@ -43,6 +43,7 @@ objdump::createXCOFFDumper(const object::XCOFFObjectFile &Obj) {
Error objdump::getXCOFFRelocationValueString(const XCOFFObjectFile &Obj,
const RelocationRef &Rel,
+ bool SymbolDescription,
SmallVectorImpl<char> &Result) {
symbol_iterator SymI = Rel.getSymbol();
if (SymI == Obj.symbol_end())
diff --git a/llvm/tools/llvm-objdump/XCOFFDump.h b/llvm/tools/llvm-objdump/XCOFFDump.h
index cf5b19f910ea..0ba6ba4cdaaa 100644
--- a/llvm/tools/llvm-objdump/XCOFFDump.h
+++ b/llvm/tools/llvm-objdump/XCOFFDump.h
@@ -33,6 +33,7 @@ std::string getXCOFFSymbolDescription(const SymbolInfoTy &SymbolInfo,
Error getXCOFFRelocationValueString(const object::XCOFFObjectFile &Obj,
const object::RelocationRef &RelRef,
+ bool SymbolDescription,
llvm::SmallVectorImpl<char> &Result);
void dumpTracebackTable(ArrayRef<uint8_t> Bytes, uint64_t Address,
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 463d73e73ef8..7467a6062b5a 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -424,6 +424,7 @@ bool objdump::isRelocAddressLess(RelocationRef A, RelocationRef B) {
}
static Error getRelocationValueString(const RelocationRef &Rel,
+ bool SymbolDescription,
SmallVectorImpl<char> &Result) {
const ObjectFile *Obj = Rel.getObject();
if (auto *ELF = dyn_cast<ELFObjectFileBase>(Obj))
@@ -435,7 +436,8 @@ static Error getRelocationValueString(const RelocationRef &Rel,
if (auto *MachO = dyn_cast<MachOObjectFile>(Obj))
return getMachORelocationValueString(MachO, Rel, Result);
if (auto *XCOFF = dyn_cast<XCOFFObjectFile>(Obj))
- return getXCOFFRelocationValueString(*XCOFF, Rel, Result);
+ return getXCOFFRelocationValueString(*XCOFF, Rel, SymbolDescription,
+ Result);
llvm_unreachable("unknown object file format");
}
@@ -527,7 +529,7 @@ static void printRelocation(formatted_raw_ostream &OS, StringRef FileName,
SmallString<16> Name;
SmallString<32> Val;
Rel.getTypeName(Name);
- if (Error E = getRelocationValueString(Rel, Val))
+ if (Error E = getRelocationValueString(Rel, SymbolDescription, Val))
reportError(std::move(E), FileName);
OS << (Is64Bits || !LeadingAddr ? "\t\t" : "\t\t\t");
if (LeadingAddr)
@@ -1289,7 +1291,8 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA,
uint64_t Start, uint64_t End,
std::unordered_map<uint64_t, std::string> &Labels) {
// So far only supports PowerPC and X86.
- if (!STI->getTargetTriple().isPPC() && !STI->getTargetTriple().isX86())
+ const bool isPPC = STI->getTargetTriple().isPPC();
+ if (!isPPC && !STI->getTargetTriple().isX86())
return;
if (MIA)
@@ -1299,8 +1302,8 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA,
unsigned LabelCount = 0;
Start += SectionAddr;
End += SectionAddr;
- uint64_t Index = Start;
- while (Index < End) {
+ const bool isXCOFF = STI->getTargetTriple().isOSBinFormatXCOFF();
+ for (uint64_t Index = Start; Index < End;) {
// Disassemble a real instruction and record function-local branch labels.
MCInst Inst;
uint64_t Size;
@@ -1311,18 +1314,22 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA,
Size = std::min<uint64_t>(ThisBytes.size(),
DisAsm->suggestBytesToSkip(ThisBytes, Index));
- if (Disassembled && MIA) {
- uint64_t Target;
- bool TargetKnown = MIA->evaluateBranch(Inst, Index, Size, Target);
- // On PowerPC, if the address of a branch is the same as the target, it
- // means that it's a function call. Do not mark the label for this case.
- if (TargetKnown && (Target >= Start && Target < End) &&
- !Labels.count(Target) &&
- !(STI->getTargetTriple().isPPC() && Target == Index))
- Labels[Target] = ("L" + Twine(LabelCount++)).str();
- MIA->updateState(Inst, Index);
- } else if (!Disassembled && MIA) {
- MIA->resetState();
+ if (MIA) {
+ if (Disassembled) {
+ uint64_t Target;
+ bool TargetKnown = MIA->evaluateBranch(Inst, Index, Size, Target);
+ if (TargetKnown && (Target >= Start && Target < End) &&
+ !Labels.count(Target)) {
+ // On PowerPC and AIX, a function call is encoded as a branch to 0.
+ // On other PowerPC platforms (ELF), a function call is encoded as
+ // a branch to self. Do not add a label for these cases.
+ if (!(isPPC &&
+ ((Target == 0 && isXCOFF) || (Target == Index && !isXCOFF))))
+ Labels[Target] = ("L" + Twine(LabelCount++)).str();
+ }
+ MIA->updateState(Inst, Index);
+ } else
+ MIA->resetState();
}
Index += Size;
}
@@ -1486,7 +1493,7 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
}
std::map<SectionRef, std::vector<RelocationRef>> RelocMap;
- if (InlineRelocs)
+ if (InlineRelocs || Obj.isXCOFF())
RelocMap = getRelocsMap(Obj);
bool Is64Bits = Obj.getBytesInAddress() > 4;
@@ -1985,6 +1992,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
DT->InstrAnalysis->resetState();
while (Index < End) {
+ uint64_t RelOffset;
+
// ARM and AArch64 ELF binaries can interleave data and text in the
// same section. We rely on the markers introduced to understand what
// we need to dump. If the data marker is within a function, it is
@@ -2019,6 +2028,26 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
}
}
+ auto findRel = [&]() {
+ while (RelCur != RelEnd) {
+ RelOffset = RelCur->getOffset() - RelAdjustment;
+ // If this relocation is hidden, skip it.
+ if (getHidden(*RelCur) || SectionAddr + RelOffset < StartAddress) {
+ ++RelCur;
+ continue;
+ }
+
+ // Stop when RelCur's offset is past the disassembled
+ // instruction/data.
+ if (RelOffset >= Index + Size)
+ return false;
+ if (RelOffset >= Index)
+ return true;
+ ++RelCur;
+ }
+ return false;
+ };
+
if (DumpARMELFData) {
Size = dumpARMELFData(SectionAddr, Index, End, Obj, Bytes,
MappingSymbols, *DT->SubtargetInfo, FOS);
@@ -2029,7 +2058,7 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
uint64_t MaxOffset = End - Index;
// For --reloc: print zero blocks patched by relocations, so that
// relocations can be shown in the dump.
- if (RelCur != RelEnd)
+ if (InlineRelocs && RelCur != RelEnd)
MaxOffset = std::min(RelCur->getOffset() - RelAdjustment - Index,
MaxOffset);
@@ -2087,17 +2116,19 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
DT->InstPrinter->setCommentStream(llvm::nulls());
- // If disassembly has failed, avoid analysing invalid/incomplete
- // instruction information. Otherwise, try to resolve the target
- // address (jump target or memory operand address) and print it on the
+ // If disassembly succeeds, we try to resolve the target address
+ // (jump target or memory operand address) and print it to the
// right of the instruction.
+ //
+ // Otherwise, we don't print anything else so that we avoid
+ // analyzing invalid or incomplete instruction information.
if (Disassembled && DT->InstrAnalysis) {
- // Branch targets are printed just after the instructions.
llvm::raw_ostream *TargetOS = &FOS;
uint64_t Target;
bool PrintTarget = DT->InstrAnalysis->evaluateBranch(
Inst, SectionAddr + Index, Size, Target);
- if (!PrintTarget)
+
+ if (!PrintTarget) {
if (std::optional<uint64_t> MaybeTarget =
DT->InstrAnalysis->evaluateMemoryOperandAddress(
Inst, DT->SubtargetInfo.get(), SectionAddr + Index,
@@ -2111,6 +2142,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
*TargetOS << "0x" << Twine::utohexstr(Target);
}
}
+ }
+
if (PrintTarget) {
// In a relocatable object, the target's section must reside in
// the same section as the call instruction or it is accessed
@@ -2120,7 +2153,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
// In that case, locate the section(s) containing the target
// address and find the symbol in one of those, if possible.
//
- // N.B. We don't walk the relocations in the relocatable case yet.
+ // N.B. Except for XCOFF, we don't walk the relocations in the
+ // relocatable case yet.
std::vector<const SectionSymbolsTy *> TargetSectionSymbols;
if (!Obj.isRelocatableObject()) {
auto It = llvm::partition_point(
@@ -2166,19 +2200,65 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
break;
}
+ // Branch targets are printed just after the instructions.
// Print the labels corresponding to the target if there's any.
bool BBAddrMapLabelAvailable = BBAddrMapLabels.count(Target);
bool LabelAvailable = AllLabels.count(Target);
+
if (TargetSym != nullptr) {
uint64_t TargetAddress = TargetSym->Addr;
uint64_t Disp = Target - TargetAddress;
std::string TargetName = Demangle ? demangle(TargetSym->Name)
: TargetSym->Name.str();
+ bool RelFixedUp = false;
+ SmallString<32> Val;
*TargetOS << " <";
- if (!Disp) {
- // Always Print the binary symbol precisely corresponding to
- // the target address.
+ // On XCOFF, we use relocations, even without -r, so we
+ // can print the correct name for an extern function call.
+ if (Obj.isXCOFF() && findRel()) {
+ // Check for possible branch relocations and
+ // branches to fixup code.
+ bool BranchRelocationType = true;
+ XCOFF::RelocationType RelocType;
+ if (Obj.is64Bit()) {
+ const XCOFFRelocation64 *Reloc =
+ reinterpret_cast<XCOFFRelocation64 *>(
+ RelCur->getRawDataRefImpl().p);
+ RelFixedUp = Reloc->isFixupIndicated();
+ RelocType = Reloc->Type;
+ } else {
+ const XCOFFRelocation32 *Reloc =
+ reinterpret_cast<XCOFFRelocation32 *>(
+ RelCur->getRawDataRefImpl().p);
+ RelFixedUp = Reloc->isFixupIndicated();
+ RelocType = Reloc->Type;
+ }
+ BranchRelocationType =
+ RelocType == XCOFF::R_BA || RelocType == XCOFF::R_BR ||
+ RelocType == XCOFF::R_RBA || RelocType == XCOFF::R_RBR;
+
+ // If we have a valid relocation, try to print its
+ // corresponding symbol name. Multiple relocations on the
+ // same instruction are not handled.
+ // Branches to fixup code will have the RelFixedUp flag set in
+ // the RLD. For these instructions, we print the correct
+ // branch target, but print the referenced symbol as a
+ // comment.
+ if (Error E = getRelocationValueString(*RelCur, false, Val)) {
+ // If -r was used, this error will be printed later.
+ // Otherwise, we ignore the error and print what
+ // would have been printed without using relocations.
+ consumeError(std::move(E));
+ *TargetOS << TargetName;
+ RelFixedUp = false; // Suppress comment for RLD sym name
+ } else if (BranchRelocationType && !RelFixedUp)
+ *TargetOS << Val;
+ else
+ *TargetOS << TargetName;
+ if (Disp)
+ *TargetOS << "+0x" << Twine::utohexstr(Disp);
+ } else if (!Disp) {
*TargetOS << TargetName;
} else if (BBAddrMapLabelAvailable) {
*TargetOS << BBAddrMapLabels[Target].front();
@@ -2190,6 +2270,12 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
*TargetOS << TargetName << "+0x" << Twine::utohexstr(Disp);
}
*TargetOS << ">";
+ if (RelFixedUp && !InlineRelocs) {
+ // We have fixup code for a relocation. We print the
+ // referenced symbol as a comment.
+ *TargetOS << "\t# " << Val;
+ }
+
} else if (BBAddrMapLabelAvailable) {
*TargetOS << " <" << BBAddrMapLabels[Target].front() << ">";
} else if (LabelAvailable) {
@@ -2215,36 +2301,20 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
if (BTF)
printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LVP);
- // Hexagon does this in pretty printer
- if (Obj.getArch() != Triple::hexagon) {
- // Print relocation for instruction and data.
- while (RelCur != RelEnd) {
- uint64_t Offset = RelCur->getOffset() - RelAdjustment;
- // If this relocation is hidden, skip it.
- if (getHidden(*RelCur)) {
- ++RelCur;
- continue;
- }
-
- // Stop when RelCur's offset is past the disassembled
- // instruction/data. Note that it's possible the disassembled data
- // is not the complete data: we might see the relocation printed in
- // the middle of the data, but this matches the binutils objdump
- // output.
- if (Offset >= Index + Size)
- break;
-
+ // Hexagon handles relocs in pretty printer
+ if (InlineRelocs && Obj.getArch() != Triple::hexagon) {
+ while (findRel()) {
// When --adjust-vma is used, update the address printed.
if (RelCur->getSymbol() != Obj.symbol_end()) {
Expected<section_iterator> SymSI =
RelCur->getSymbol()->getSection();
if (SymSI && *SymSI != Obj.section_end() &&
shouldAdjustVA(**SymSI))
- Offset += AdjustVMA;
+ RelOffset += AdjustVMA;
}
printRelocation(FOS, Obj.getFileName(), *RelCur,
- SectionAddr + Offset, Is64Bits);
+ SectionAddr + RelOffset, Is64Bits);
LVP.printAfterOtherLine(FOS, true);
++RelCur;
}
@@ -2428,7 +2498,8 @@ void Dumper::printRelocations() {
if (Address < StartAddress || Address > StopAddress || getHidden(Reloc))
continue;
Reloc.getTypeName(RelocName);
- if (Error E = getRelocationValueString(Reloc, ValueStr))
+ if (Error E =
+ getRelocationValueString(Reloc, SymbolDescription, ValueStr))
reportUniqueWarning(std::move(E));
outs() << format(Fmt.data(), Address) << " "
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 322b7da2678f..12b81d411cfa 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -1651,10 +1651,10 @@ struct SampleOverlapStats {
namespace {
struct FuncSampleStats {
- uint64_t SampleSum;
- uint64_t MaxSample;
- uint64_t HotBlockCount;
- FuncSampleStats() : SampleSum(0), MaxSample(0), HotBlockCount(0) {}
+ uint64_t SampleSum = 0;
+ uint64_t MaxSample = 0;
+ uint64_t HotBlockCount = 0;
+ FuncSampleStats() = default;
FuncSampleStats(uint64_t SampleSum, uint64_t MaxSample,
uint64_t HotBlockCount)
: SampleSum(SampleSum), MaxSample(MaxSample),
@@ -2563,12 +2563,10 @@ static int overlap_main(int argc, const char *argv[]) {
namespace {
struct ValueSitesStats {
- ValueSitesStats()
- : TotalNumValueSites(0), TotalNumValueSitesWithValueProfile(0),
- TotalNumValues(0) {}
- uint64_t TotalNumValueSites;
- uint64_t TotalNumValueSitesWithValueProfile;
- uint64_t TotalNumValues;
+ ValueSitesStats() = default;
+ uint64_t TotalNumValueSites = 0;
+ uint64_t TotalNumValueSitesWithValueProfile = 0;
+ uint64_t TotalNumValues = 0;
std::vector<unsigned> ValueSitesHistogram;
};
} // namespace
@@ -2867,13 +2865,12 @@ static void showSectionInfo(sampleprof::SampleProfileReader *Reader,
namespace {
struct HotFuncInfo {
std::string FuncName;
- uint64_t TotalCount;
- double TotalCountPercent;
- uint64_t MaxCount;
- uint64_t EntryCount;
+ uint64_t TotalCount = 0;
+ double TotalCountPercent = 0.0f;
+ uint64_t MaxCount = 0;
+ uint64_t EntryCount = 0;
- HotFuncInfo()
- : TotalCount(0), TotalCountPercent(0.0f), MaxCount(0), EntryCount(0) {}
+ HotFuncInfo() = default;
HotFuncInfo(StringRef FN, uint64_t TS, double TSP, uint64_t MS, uint64_t ES)
: FuncName(FN.begin(), FN.end()), TotalCount(TS), TotalCountPercent(TSP),
diff --git a/llvm/tools/llvm-profgen/CSPreInliner.cpp b/llvm/tools/llvm-profgen/CSPreInliner.cpp
index 025d3ca5a6da..87df6996aa43 100644
--- a/llvm/tools/llvm-profgen/CSPreInliner.cpp
+++ b/llvm/tools/llvm-profgen/CSPreInliner.cpp
@@ -128,9 +128,8 @@ bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
uint64_t CallsiteCount = 0;
LineLocation Callsite = CalleeNode->getCallSiteLoc();
if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
- SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
- auto It = TargetCounts.find(CalleeSamples->getFunction());
- if (It != TargetCounts.end())
+ auto It = CallTargets->find(CalleeSamples->getFunction());
+ if (It != CallTargets->end())
CallsiteCount = It->second;
}
diff --git a/llvm/tools/llvm-readtapi/CMakeLists.txt b/llvm/tools/llvm-readtapi/CMakeLists.txt
index ef88d786d832..855a1473888f 100644
--- a/llvm/tools/llvm-readtapi/CMakeLists.txt
+++ b/llvm/tools/llvm-readtapi/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
+ BinaryFormat
Object
Support
Option
diff --git a/llvm/tools/llvm-readtapi/TapiOpts.td b/llvm/tools/llvm-readtapi/TapiOpts.td
index 552690ce1385..34ec5616a42e 100644
--- a/llvm/tools/llvm-readtapi/TapiOpts.td
+++ b/llvm/tools/llvm-readtapi/TapiOpts.td
@@ -15,6 +15,7 @@ def compare : FF<"compare", "compare tapi files for library differences">, Group
def merge : FF<"merge", "merge the input files that represent the same library">, Group<action_group>;
def extract: FF<"extract", "extract <architecture> from input file">, Group<action_group>;
def remove: FF<"remove", "remove <architecture> from input file">, Group<action_group>;
+def stubify: FF<"stubify", "create a tapi file from a dynamic library or framework">, Group<action_group>;
//
// General Driver options
@@ -26,3 +27,8 @@ defm output: JS<"o", "write output to <file>","<file>">;
def compact: FF<"compact", "write compact tapi output file">;
defm filetype: JS<"filetype", "specify the output file type (tbd-v3, tbd-v4 or tbd-v5)","<value>">;
defm arch: JS<"arch", "specify the <architecture>", "<architecture>">;
+
+//
+// Stub options
+//
+def delete_input : FF<"delete-input", "delete and replace input file on success">;
diff --git a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp
index 5fa023d12525..193a281d6341 100644
--- a/llvm/tools/llvm-readtapi/llvm-readtapi.cpp
+++ b/llvm/tools/llvm-readtapi/llvm-readtapi.cpp
@@ -10,23 +10,32 @@
//
//===----------------------------------------------------------------------===//
#include "DiffEngine.h"
+#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/TextAPI/DylibReader.h"
#include "llvm/TextAPI/TextAPIError.h"
#include "llvm/TextAPI/TextAPIReader.h"
#include "llvm/TextAPI/TextAPIWriter.h"
+#include "llvm/TextAPI/Utils.h"
#include <cstdlib>
using namespace llvm;
using namespace MachO;
using namespace object;
+#if !defined(PATH_MAX)
+#define PATH_MAX 1024
+#endif
+
namespace {
using namespace llvm::opt;
enum ID {
@@ -56,10 +65,15 @@ public:
}
};
+struct StubOptions {
+ bool DeleteInput = false;
+};
+
struct Context {
std::vector<std::string> Inputs;
std::unique_ptr<llvm::raw_fd_stream> OutStream;
FileType WriteFT = FileType::TBD_V5;
+ StubOptions StubOpt;
bool Compact = false;
Architecture Arch = AK_unknown;
};
@@ -86,13 +100,35 @@ getInterfaceFile(const StringRef Filename, bool ResetBanner = true) {
MemoryBuffer::getFile(Filename);
if (BufferOrErr.getError())
ExitOnErr(errorCodeToError(BufferOrErr.getError()));
- Expected<std::unique_ptr<InterfaceFile>> IF =
- TextAPIReader::get((*BufferOrErr)->getMemBufferRef());
- if (!IF)
- ExitOnErr(IF.takeError());
+ auto Buffer = std::move(*BufferOrErr);
+
+ std::unique_ptr<InterfaceFile> IF;
+ switch (identify_magic(Buffer->getBuffer())) {
+ case file_magic::macho_dynamically_linked_shared_lib:
+ LLVM_FALLTHROUGH;
+ case file_magic::macho_dynamically_linked_shared_lib_stub:
+ LLVM_FALLTHROUGH;
+ case file_magic::macho_universal_binary: {
+ auto IFOrErr = DylibReader::get(Buffer->getMemBufferRef());
+ if (!IFOrErr)
+ ExitOnErr(IFOrErr.takeError());
+ IF = std::move(*IFOrErr);
+ break;
+ }
+ case file_magic::tapi_file: {
+ auto IFOrErr = TextAPIReader::get(Buffer->getMemBufferRef());
+ if (!IFOrErr)
+ ExitOnErr(IFOrErr.takeError());
+ IF = std::move(*IFOrErr);
+ break;
+ }
+ default:
+ reportError(Filename + ": unsupported file type");
+ }
+
if (ResetBanner)
ExitOnErr.setBanner(TOOLNAME + ": error: ");
- return std::move(*IF);
+ return IF;
}
static bool handleCompareAction(const Context &Ctx) {
@@ -142,6 +178,30 @@ static bool handleMergeAction(const Context &Ctx) {
return handleWriteAction(Ctx, std::move(Out));
}
+static bool handleStubifyAction(Context &Ctx) {
+ if (Ctx.Inputs.empty())
+ reportError("stubify requires at least one input file");
+
+ if ((Ctx.Inputs.size() > 1) && (Ctx.OutStream != nullptr))
+ reportError("cannot write multiple inputs into single output file");
+
+ for (StringRef FileName : Ctx.Inputs) {
+ auto IF = getInterfaceFile(FileName);
+ if (Ctx.StubOpt.DeleteInput) {
+ std::error_code EC;
+ SmallString<PATH_MAX> OutputLoc = FileName;
+ MachO::replace_extension(OutputLoc, ".tbd");
+ Ctx.OutStream = std::make_unique<llvm::raw_fd_stream>(OutputLoc, EC);
+ if (EC)
+        reportError("opening file '" + OutputLoc + "': " + EC.message());
+ if (auto Err = sys::fs::remove(FileName))
+        reportError("deleting file '" + FileName + "': " + Err.message());
+ }
+ handleWriteAction(Ctx, std::move(IF));
+ }
+ return EXIT_SUCCESS;
+}
+
using IFOperation =
std::function<llvm::Expected<std::unique_ptr<InterfaceFile>>(
const llvm::MachO::InterfaceFile &, Architecture)>;
@@ -160,6 +220,10 @@ static bool handleSingleFileAction(const Context &Ctx, const StringRef Action,
return handleWriteAction(Ctx, std::move(*OutIF));
}
+static void setStubOptions(opt::InputArgList &Args, StubOptions &Opt) {
+ Opt.DeleteInput = Args.hasArg(OPT_delete_input);
+}
+
int main(int Argc, char **Argv) {
InitLLVM X(Argc, Argv);
BumpPtrAllocator A;
@@ -183,6 +247,7 @@ int main(int Argc, char **Argv) {
return EXIT_SUCCESS;
}
+ // TODO: Add support for picking up libraries from directory input.
for (opt::Arg *A : Args.filtered(OPT_INPUT))
Ctx.Inputs.push_back(A->getValue());
@@ -237,6 +302,9 @@ int main(int Argc, char **Argv) {
return handleSingleFileAction(Ctx, "extract", &InterfaceFile::extract);
case OPT_remove:
return handleSingleFileAction(Ctx, "remove", &InterfaceFile::remove);
+ case OPT_stubify:
+ setStubOptions(Args, Ctx.StubOpt);
+ return handleStubifyAction(Ctx);
}
return EXIT_SUCCESS;
diff --git a/llvm/tools/llvm-special-case-list-fuzzer/special-case-list-fuzzer.cpp b/llvm/tools/llvm-special-case-list-fuzzer/special-case-list-fuzzer.cpp
index aaab5f8470c9..0691f294fa0e 100644
--- a/llvm/tools/llvm-special-case-list-fuzzer/special-case-list-fuzzer.cpp
+++ b/llvm/tools/llvm-special-case-list-fuzzer/special-case-list-fuzzer.cpp
@@ -12,8 +12,9 @@
#include <cstdlib>
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
- std::unique_ptr<llvm::MemoryBuffer> Buf = llvm::MemoryBuffer::getMemBuffer(
- llvm::StringRef(reinterpret_cast<const char *>(Data), Size), "", false);
+ std::string Payload(reinterpret_cast<const char *>(Data), Size);
+ std::unique_ptr<llvm::MemoryBuffer> Buf =
+ llvm::MemoryBuffer::getMemBuffer(Payload);
if (!Buf)
return 0;
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
index 292b5cade950..34b06fe480f3 100644
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -621,3 +621,38 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
EXPECT_TRUE(isLibFunc(F, LF));
}
}
+
+namespace {
+
+/// Creates TLI for AArch64 and uses it to get the LibFunc names for the given
+/// Instruction opcode and Type.
+class TLITestAarch64 : public ::testing::Test {
+private:
+ const Triple TargetTriple;
+
+protected:
+ LLVMContext Ctx;
+ std::unique_ptr<TargetLibraryInfoImpl> TLII;
+ std::unique_ptr<TargetLibraryInfo> TLI;
+
+ /// Create TLI for AArch64
+ TLITestAarch64() : TargetTriple(Triple("aarch64-unknown-linux-gnu")) {
+ TLII = std::make_unique<TargetLibraryInfoImpl>(
+ TargetLibraryInfoImpl(TargetTriple));
+ TLI = std::make_unique<TargetLibraryInfo>(TargetLibraryInfo(*TLII));
+ }
+
+ /// Returns the TLI function name for the given \p Opcode and type \p Ty.
+ StringRef getScalarName(unsigned int Opcode, Type *Ty) {
+ LibFunc Func;
+ if (!TLI->getLibFunc(Opcode, Ty, Func))
+ return "";
+ return TLI->getName(Func);
+ }
+};
+} // end anonymous namespace
+
+TEST_F(TLITestAarch64, TestFrem) {
+ EXPECT_EQ(getScalarName(Instruction::FRem, Type::getDoubleTy(Ctx)), "fmod");
+ EXPECT_EQ(getScalarName(Instruction::FRem, Type::getFloatTy(Ctx)), "fmodf");
+}
\ No newline at end of file
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 0d3a594da0c0..27f631884072 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -1177,12 +1177,12 @@ TEST(ValueTracking, canCreatePoisonOrUndef) {
{{false, false},
"shufflevector <4 x i32> %vx, <4 x i32> %vx2, "
"<4 x i32> <i32 0, i32 1, i32 2, i32 3>"},
- {{false, true},
+ {{true, false},
"shufflevector <4 x i32> %vx, <4 x i32> %vx2, "
- "<4 x i32> <i32 0, i32 1, i32 2, i32 undef>"},
- {{false, true},
+ "<4 x i32> <i32 0, i32 1, i32 2, i32 poison>"},
+ {{true, false},
"shufflevector <vscale x 4 x i32> %svx, "
- "<vscale x 4 x i32> %svx, <vscale x 4 x i32> undef"},
+ "<vscale x 4 x i32> %svx, <vscale x 4 x i32> poison"},
{{true, false}, "call i32 @g(i32 %x)"},
{{false, false}, "call noundef i32 @g(i32 %x)"},
{{true, false}, "fcmp nnan oeq float %fx, %fy"},
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index acbcd247fa9a..7d4eaf388f72 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -497,8 +497,7 @@ body: |
std::pair<FuncValueTable, FuncValueTable>
allocValueTables(unsigned Blocks, unsigned Locs) {
- return {FuncValueTable(Blocks, ValueTable(Locs)),
- FuncValueTable(Blocks, ValueTable(Locs))};
+ return {FuncValueTable(Blocks, Locs), FuncValueTable(Blocks, Locs)};
}
};
@@ -917,8 +916,7 @@ TEST_F(InstrRefLDVTest, MLocSingleBlock) {
// Set up live-in and live-out tables for this function: two locations (we
// add one later) in a single block.
- FuncValueTable MOutLocs, MInLocs;
- std::tie(MOutLocs, MInLocs) = allocValueTables(1, 2);
+ auto [MOutLocs, MInLocs] = allocValueTables(1, 2);
// Transfer function: nothing.
SmallVector<MLocTransferMap, 1> TransferFunc;
@@ -983,8 +981,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondBlocks) {
Register RAX = getRegByName("RAX");
LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(4, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(4, 2);
// Transfer function: start with nothing.
SmallVector<MLocTransferMap, 1> TransferFunc;
@@ -1137,8 +1134,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
// There are other locations, for things like xmm0, which we're going to
// ignore here.
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(4, 11);
+ auto [MInLocs, MOutLocs] = allocValueTables(4, 11);
// Transfer function: start with nothing.
SmallVector<MLocTransferMap, 1> TransferFunc;
@@ -1199,8 +1195,7 @@ TEST_F(InstrRefLDVTest, MLocSimpleLoop) {
Register RAX = getRegByName("RAX");
LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(3, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(3, 2);
SmallVector<MLocTransferMap, 1> TransferFunc;
TransferFunc.resize(3);
@@ -1298,8 +1293,7 @@ TEST_F(InstrRefLDVTest, MLocNestedLoop) {
Register RAX = getRegByName("RAX");
LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(5, 2);
SmallVector<MLocTransferMap, 1> TransferFunc;
TransferFunc.resize(5);
@@ -1500,8 +1494,7 @@ TEST_F(InstrRefLDVTest, MLocNoDominatingLoop) {
Register RAX = getRegByName("RAX");
LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(5, 2);
SmallVector<MLocTransferMap, 1> TransferFunc;
TransferFunc.resize(5);
@@ -1656,8 +1649,7 @@ TEST_F(InstrRefLDVTest, MLocBadlyNestedLoops) {
Register RAX = getRegByName("RAX");
LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(5, 2);
SmallVector<MLocTransferMap, 1> TransferFunc;
TransferFunc.resize(5);
@@ -1789,8 +1781,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocDiamond) {
Register RAX = getRegByName("RAX");
LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(4, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(4, 2);
initValueArray(MOutLocs, 4, 2);
@@ -1986,8 +1977,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocLoops) {
Register RAX = getRegByName("RAX");
LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(3, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(3, 2);
initValueArray(MOutLocs, 3, 2);
@@ -2117,8 +2107,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocBadlyNestedLoops) {
Register RBX = getRegByName("RBX");
LocIdx RbxLoc = MTracker->lookupOrTrackRegister(RBX);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(5, 3);
+ auto [MInLocs, MOutLocs] = allocValueTables(5, 3);
initValueArray(MOutLocs, 5, 3);
@@ -2635,8 +2624,7 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) {
ASSERT_TRUE(MTracker->getNumLocs() == 1);
LocIdx RspLoc(0);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(1, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(1, 2);
ValueIDNum LiveInRsp = ValueIDNum(0, 0, RspLoc);
DbgOpID LiveInRspID = addValueDbgOp(LiveInRsp);
@@ -2699,8 +2687,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) {
DbgOpID LiveInRaxID = addValueDbgOp(LiveInRax);
DbgOpID RspPHIInBlk3ID = addValueDbgOp(RspPHIInBlk3);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(4, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(4, 2);
initValueArray(MInLocs, 4, 2);
initValueArray(MOutLocs, 4, 2);
@@ -2921,8 +2908,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) {
DbgOpID RspDefInBlk1ID = addValueDbgOp(RspDefInBlk1);
DbgOpID RaxPHIInBlk1ID = addValueDbgOp(RaxPHIInBlk1);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(3, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(3, 2);
initValueArray(MInLocs, 3, 2);
initValueArray(MOutLocs, 3, 2);
@@ -3200,8 +3186,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) {
DbgOpID RspPHIInBlk2ID = addValueDbgOp(RspPHIInBlk2);
DbgOpID RspDefInBlk2ID = addValueDbgOp(RspDefInBlk2);
- FuncValueTable MInLocs, MOutLocs;
- std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2);
+ auto [MInLocs, MOutLocs] = allocValueTables(5, 2);
initValueArray(MInLocs, 5, 2);
initValueArray(MOutLocs, 5, 2);
diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp
index 2dd307603a82..7463824b5b52 100644
--- a/llvm/unittests/Support/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp
@@ -734,7 +734,6 @@ R"(All available -march extensions for RISC-V
xcvmac 1.0
xcvmem 1.0
xcvsimd 1.0
- xsfcie 1.0
xsfvcp 1.0
xsfvfnrclipxfqf 1.0
xsfvfwmaccqqq 1.0
@@ -755,9 +754,12 @@ R"(All available -march extensions for RISC-V
Experimental extensions
zicfilp 0.4 This is a long dummy description
+ zicfiss 0.4
zicond 1.0
+ zimop 0.1
zacas 1.0
zfbfmin 0.8
+ zcmop 0.2
ztso 0.1
zvfbfmin 0.8
zvfbfwma 0.8
diff --git a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
index 5ef86bb3f7b4..5836239bc56f 100644
--- a/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
+++ b/llvm/unittests/Target/RISCV/RISCVInstrInfoTest.cpp
@@ -10,6 +10,7 @@
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -174,6 +175,83 @@ TEST_P(RISCVInstrInfoTest, GetMemOperandsWithOffsetWidth) {
EXPECT_EQ(Width, 4u);
}
+static void expectDIEPrintResult(const DIExpression *Expr, StringRef Expected) {
+ std::string Output;
+ raw_string_ostream OS(Output);
+ Expr->print(OS);
+ OS.flush();
+ EXPECT_EQ(OS.str(), Expected);
+}
+
+TEST_P(RISCVInstrInfoTest, DescribeLoadedValue) {
+ const RISCVInstrInfo *TII = ST->getInstrInfo();
+ DebugLoc DL;
+
+ MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+ MF->getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+
+ // Register move.
+ auto *MI1 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::ADDI), RISCV::X1)
+ .addReg(RISCV::X2)
+ .addImm(0)
+ .getInstr();
+ EXPECT_FALSE(TII->describeLoadedValue(*MI1, RISCV::X2).has_value());
+ std::optional<ParamLoadedValue> MI1Res =
+ TII->describeLoadedValue(*MI1, RISCV::X1);
+ ASSERT_TRUE(MI1Res.has_value());
+ ASSERT_TRUE(MI1Res->first.isReg());
+ EXPECT_EQ(MI1Res->first.getReg(), RISCV::X2);
+ expectDIEPrintResult(MI1Res->second, "!DIExpression()");
+
+ // Load immediate.
+ auto *MI2 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::ADDI), RISCV::X3)
+ .addReg(RISCV::X0)
+ .addImm(111)
+ .getInstr();
+ std::optional<ParamLoadedValue> MI2Res =
+ TII->describeLoadedValue(*MI2, RISCV::X3);
+ ASSERT_TRUE(MI2Res.has_value());
+ ASSERT_TRUE(MI2Res->first.isReg());
+ EXPECT_EQ(MI2Res->first.getReg(), RISCV::X0);
+ // TODO: Could be a DW_OP_constu if this is recognised as an immediate load
+ // rather than just an addi.
+ expectDIEPrintResult(MI2Res->second, "!DIExpression(DW_OP_plus_uconst, 111)");
+
+ // Add immediate.
+ auto *MI3 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::ADDI), RISCV::X2)
+ .addReg(RISCV::X3)
+ .addImm(222)
+ .getInstr();
+ std::optional<ParamLoadedValue> MI3Res =
+ TII->describeLoadedValue(*MI3, RISCV::X2);
+ ASSERT_TRUE(MI3Res.has_value());
+ ASSERT_TRUE(MI3Res->first.isReg());
+ EXPECT_EQ(MI3Res->first.getReg(), RISCV::X3);
+ expectDIEPrintResult(MI3Res->second, "!DIExpression(DW_OP_plus_uconst, 222)");
+
+ // Load value from memory.
+ // It would be better (more reflective of real-world describeLoadedValue
+ // usage) to test using MachinePointerInfo::getFixedStack, but
+ // unfortunately it would be overly fiddly to make this work.
+ auto MMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+ MachineMemOperand::MOLoad, 1, Align(1));
+ auto *MI4 = BuildMI(*MBB, MBB->begin(), DL, TII->get(RISCV::LB), RISCV::X1)
+ .addReg(RISCV::X2)
+ .addImm(-128)
+ .addMemOperand(MMO)
+ .getInstr();
+ std::optional<ParamLoadedValue> MI4Res =
+ TII->describeLoadedValue(*MI4, RISCV::X1);
+ ASSERT_TRUE(MI4Res.has_value());
+ ASSERT_TRUE(MI4Res->first.isReg());
+ EXPECT_EQ(MI4Res->first.getReg(), RISCV::X2);
+ expectDIEPrintResult(
+ MI4Res->second,
+ "!DIExpression(DW_OP_constu, 128, DW_OP_minus, DW_OP_deref_size, 1)");
+
+ MF->deleteMachineBasicBlock(MBB);
+}
+
} // namespace
INSTANTIATE_TEST_SUITE_P(RV32And64, RISCVInstrInfoTest,
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 30e60ad92b68..92bd4da1d3a4 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1812,7 +1812,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
AArch64::AEK_SSVE_FP8DOT4, AArch64::AEK_LUT,
AArch64::AEK_SME_LUTv2, AArch64::AEK_SMEF8F16,
AArch64::AEK_SMEF8F32, AArch64::AEK_SMEFA64,
- AArch64::AEK_CPA};
+ AArch64::AEK_CPA, AArch64::AEK_PAUTHLR,
+ AArch64::AEK_TLBIW,
+ };
std::vector<StringRef> Features;
@@ -1899,6 +1901,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f32"));
EXPECT_TRUE(llvm::is_contained(Features, "+sme-fa64"));
EXPECT_TRUE(llvm::is_contained(Features, "+cpa"));
+ EXPECT_TRUE(llvm::is_contained(Features, "+pauth-lr"));
+ EXPECT_TRUE(llvm::is_contained(Features, "+tlbiw"));
// Assuming we listed every extension above, this should produce the same
// result. (note that AEK_NONE doesn't have a name so it won't be in the
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 7deeff8887db..73724e662f9e 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -985,7 +985,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info,
bool IsIsolatedToken = true;
for (size_t i = 0, e = String.size(); i != e; ++i) {
char Char = String[i];
- if (Variant.BreakCharacters.find(Char) != std::string::npos) {
+ if (Variant.BreakCharacters.contains(Char)) {
if (InTok) {
addAsmOperand(String.slice(Prev, i), false);
Prev = i;
@@ -994,7 +994,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info,
InTok = true;
continue;
}
- if (Variant.TokenizingCharacters.find(Char) != std::string::npos) {
+ if (Variant.TokenizingCharacters.contains(Char)) {
if (InTok) {
addAsmOperand(String.slice(Prev, i), IsIsolatedToken);
InTok = false;
@@ -1005,7 +1005,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info,
IsIsolatedToken = true;
continue;
}
- if (Variant.SeparatorCharacters.find(Char) != std::string::npos) {
+ if (Variant.SeparatorCharacters.contains(Char)) {
if (InTok) {
addAsmOperand(String.slice(Prev, i), IsIsolatedToken);
InTok = false;
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 06e7ec3b9230..9ee1472bdf5c 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -563,6 +563,13 @@ static inline bool inheritsFrom(InstructionContext child,
case IC_EVEX_L2_W_XD_KZ_B:
case IC_EVEX_L2_W_OPSIZE_KZ_B:
return false;
+ case IC_EVEX_NF:
+ case IC_EVEX_B_NF:
+ case IC_EVEX_OPSIZE_NF:
+ case IC_EVEX_OPSIZE_B_NF:
+ case IC_EVEX_W_NF:
+ case IC_EVEX_W_B_NF:
+ return false;
default:
errs() << "Unknown instruction class: "
<< stringForContext((InstructionContext)parent) << "\n";
@@ -889,7 +896,19 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
if ((index & ATTR_EVEX) && (index & ATTR_OPSIZE) && (index & ATTR_ADSIZE))
o << "IC_EVEX_OPSIZE_ADSIZE";
- else if ((index & ATTR_EVEX) || (index & ATTR_VEX) || (index & ATTR_VEXL)) {
+ else if (index & ATTR_EVEXNF) {
+ o << "IC_EVEX";
+ if (index & ATTR_REXW)
+ o << "_W";
+ else if (index & ATTR_OPSIZE)
+ o << "_OPSIZE";
+
+ if (index & ATTR_EVEXB)
+ o << "_B";
+
+ o << "_NF";
+ } else if ((index & ATTR_EVEX) || (index & ATTR_VEX) ||
+ (index & ATTR_VEXL)) {
if (index & ATTR_EVEX)
o << "IC_EVEX";
else
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index d3299e281031..101b75e2f087 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -348,7 +348,9 @@ public:
// memory form: broadcast
if (IsBroadcast && (RegRI.HasEVEX_B || !MemRI.HasEVEX_B))
return false;
- if (!IsBroadcast && (RegRI.HasEVEX_B || MemRI.HasEVEX_B))
+ // EVEX_B indicates NDD for MAP4 instructions
+ if (!IsBroadcast && (RegRI.HasEVEX_B || MemRI.HasEVEX_B) &&
+ RegRI.OpMap != X86Local::T_MAP4)
return false;
if (!mayFoldFromLeftToRight(RegRI.Form, MemRI.Form))
@@ -369,7 +371,8 @@ public:
RegRI.OpMap, RegRI.OpSize, RegRI.AdSize, RegRI.HasREX_W,
RegRI.HasVEX_4V, RegRI.HasVEX_L, RegRI.IgnoresVEX_L,
RegRI.IgnoresW, RegRI.HasEVEX_K, RegRI.HasEVEX_KZ,
- RegRI.HasEVEX_L2, RegRec->getValueAsBit("hasEVEX_RC"),
+ RegRI.HasEVEX_L2, RegRI.HasEVEX_NF,
+ RegRec->getValueAsBit("hasEVEX_RC"),
RegRec->getValueAsBit("hasLockPrefix"),
RegRec->getValueAsBit("hasNoTrackPrefix"),
RegRec->getValueAsBit("EVEX_W1_VEX_W0")) !=
@@ -377,7 +380,8 @@ public:
MemRI.OpMap, MemRI.OpSize, MemRI.AdSize, MemRI.HasREX_W,
MemRI.HasVEX_4V, MemRI.HasVEX_L, MemRI.IgnoresVEX_L,
MemRI.IgnoresW, MemRI.HasEVEX_K, MemRI.HasEVEX_KZ,
- MemRI.HasEVEX_L2, MemRec->getValueAsBit("hasEVEX_RC"),
+ MemRI.HasEVEX_L2, MemRI.HasEVEX_NF,
+ MemRec->getValueAsBit("hasEVEX_RC"),
MemRec->getValueAsBit("hasLockPrefix"),
MemRec->getValueAsBit("hasNoTrackPrefix"),
MemRec->getValueAsBit("EVEX_W1_VEX_W0")))
@@ -668,6 +672,14 @@ void X86FoldTablesEmitter::run(raw_ostream &O) {
if (NoFoldSet.find(Rec->getName()) != NoFoldSet.end())
continue;
+ // A promoted legacy instruction is in EVEX space and has a REX2-encoding
+ // alternative. It is added due to HW design and never emitted by the compiler.
+ if (byteFromBitsInit(Rec->getValueAsBitsInit("OpMapBits")) ==
+ X86Local::T_MAP4 &&
+ byteFromBitsInit(Rec->getValueAsBitsInit("explicitOpPrefixBits")) ==
+ X86Local::ExplicitEVEX)
+ continue;
+
// - Instructions including RST register class operands are not relevant
// for memory folding (for further details check the explanation in
// lib/Target/X86/X86InstrFPStack.td file).
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index 47ee9544f323..fb430676c504 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -125,6 +125,7 @@ RecognizableInstrBase::RecognizableInstrBase(const CodeGenInstruction &insn) {
HasEVEX_K = Rec->getValueAsBit("hasEVEX_K");
HasEVEX_KZ = Rec->getValueAsBit("hasEVEX_Z");
HasEVEX_B = Rec->getValueAsBit("hasEVEX_B");
+ HasEVEX_NF = Rec->getValueAsBit("hasEVEX_NF");
IsCodeGenOnly = Rec->getValueAsBit("isCodeGenOnly");
IsAsmParserOnly = Rec->getValueAsBit("isAsmParserOnly");
ForceDisassemble = Rec->getValueAsBit("ForceDisassemble");
@@ -185,6 +186,9 @@ void RecognizableInstr::processInstr(DisassemblerTables &tables,
: (HasEVEX_KZ ? n##_KZ \
: (HasEVEX_K ? n##_K : (HasEVEX_B ? n##_B : n)))))
+#define EVEX_NF(n) (HasEVEX_NF ? n##_NF : n)
+#define EVEX_B_NF(n) (HasEVEX_B ? EVEX_NF(n##_B) : EVEX_NF(n))
+
InstructionContext RecognizableInstr::insnContext() const {
InstructionContext insnContext;
@@ -193,8 +197,15 @@ InstructionContext RecognizableInstr::insnContext() const {
errs() << "Don't support VEX.L if EVEX_L2 is enabled: " << Name << "\n";
llvm_unreachable("Don't support VEX.L if EVEX_L2 is enabled");
}
- // VEX_L & VEX_W
- if (!EncodeRC && HasVEX_L && HasREX_W) {
+ if (HasEVEX_NF) {
+ if (OpPrefix == X86Local::PD)
+ insnContext = EVEX_B_NF(IC_EVEX_OPSIZE);
+ else if (HasREX_W)
+ insnContext = EVEX_B_NF(IC_EVEX_W);
+ else
+ insnContext = EVEX_B_NF(IC_EVEX);
+ } else if (!EncodeRC && HasVEX_L && HasREX_W) {
+ // VEX_L & VEX_W
if (OpPrefix == X86Local::PD)
insnContext = EVEX_KB(IC_EVEX_L_W_OPSIZE);
else if (OpPrefix == X86Local::XS)
@@ -486,6 +497,7 @@ void RecognizableInstr::emitInstructionSpecifier() {
++additionalOperands;
#endif
+ bool IsND = OpMap == X86Local::T_MAP4 && HasEVEX_B && HasVEX_4V;
switch (Form) {
default:
llvm_unreachable("Unhandled form");
@@ -536,11 +548,14 @@ void RecognizableInstr::emitInstructionSpecifier() {
numPhysicalOperands <= 3 + additionalOperands &&
"Unexpected number of operands for MRMDestReg");
+ if (IsND)
+ HANDLE_OPERAND(vvvvRegister)
+
HANDLE_OPERAND(rmRegister)
if (HasEVEX_K)
HANDLE_OPERAND(writemaskRegister)
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
// FIXME: In AVX, the register below becomes the one encoded
// in ModRMVEX and the one above the one in the VEX.VVVV field
HANDLE_OPERAND(vvvvRegister)
@@ -570,12 +585,15 @@ void RecognizableInstr::emitInstructionSpecifier() {
numPhysicalOperands <= 3 + additionalOperands &&
"Unexpected number of operands for MRMDestMemFrm with VEX_4V");
+ if (IsND)
+ HANDLE_OPERAND(vvvvRegister)
+
HANDLE_OPERAND(memory)
if (HasEVEX_K)
HANDLE_OPERAND(writemaskRegister)
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
// FIXME: In AVX, the register below becomes the one encoded
// in ModRMVEX and the one above the one in the VEX.VVVV field
HANDLE_OPERAND(vvvvRegister)
@@ -594,12 +612,15 @@ void RecognizableInstr::emitInstructionSpecifier() {
numPhysicalOperands <= 4 + additionalOperands &&
"Unexpected number of operands for MRMSrcRegFrm");
+ if (IsND)
+ HANDLE_OPERAND(vvvvRegister)
+
HANDLE_OPERAND(roRegister)
if (HasEVEX_K)
HANDLE_OPERAND(writemaskRegister)
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
// FIXME: In AVX, the register below becomes the one encoded
// in ModRMVEX and the one above the one in the VEX.VVVV field
HANDLE_OPERAND(vvvvRegister)
@@ -641,13 +662,15 @@ void RecognizableInstr::emitInstructionSpecifier() {
assert(numPhysicalOperands >= 2 + additionalOperands &&
numPhysicalOperands <= 4 + additionalOperands &&
"Unexpected number of operands for MRMSrcMemFrm");
+ if (IsND)
+ HANDLE_OPERAND(vvvvRegister)
HANDLE_OPERAND(roRegister)
if (HasEVEX_K)
HANDLE_OPERAND(writemaskRegister)
- if (HasVEX_4V)
+ if (!IsND && HasVEX_4V)
// FIXME: In AVX, the register below becomes the one encoded
// in ModRMVEX and the one above the one in the VEX.VVVV field
HANDLE_OPERAND(vvvvRegister)
@@ -1216,6 +1239,8 @@ RecognizableInstr::roRegisterEncodingFromString(const std::string &s,
OperandEncoding
RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s,
uint8_t OpSize) {
+ ENCODING("GR8", ENCODING_VVVV)
+ ENCODING("GR16", ENCODING_VVVV)
ENCODING("GR32", ENCODING_VVVV)
ENCODING("GR64", ENCODING_VVVV)
ENCODING("FR32", ENCODING_VVVV)
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h
index 61ad5e32b3fb..007c700cdfaf 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.h
+++ b/llvm/utils/TableGen/X86RecognizableInstr.h
@@ -172,7 +172,7 @@ enum { PD = 1, XS = 2, XD = 3, PS = 4 };
enum { VEX = 1, XOP = 2, EVEX = 3 };
enum { OpSize16 = 1, OpSize32 = 2 };
enum { AdSize16 = 1, AdSize32 = 2, AdSize64 = 3 };
-enum { ExplicitREX2 = 1 };
+enum { ExplicitREX2 = 1, ExplicitEVEX = 3 };
} // namespace X86Local
namespace X86Disassembler {
@@ -212,6 +212,8 @@ struct RecognizableInstrBase {
bool HasEVEX_KZ;
/// The hasEVEX_B field from the record
bool HasEVEX_B;
+ /// The hasEVEX_NF field from the record
+ bool HasEVEX_NF;
/// Indicates that the instruction uses the L and L' fields for RC.
bool EncodeRC;
/// The isCodeGenOnly field from the record
diff --git a/llvm/utils/chunk-print-before-all.py b/llvm/utils/chunk-print-before-all.py
index fe0eaaea1c20..fef8eb64c540 100755
--- a/llvm/utils/chunk-print-before-all.py
+++ b/llvm/utils/chunk-print-before-all.py
@@ -30,13 +30,13 @@ def print_chunk(lines, prefix, pass_name):
is_dump = False
cur = []
for line in sys.stdin:
- if line.startswith("*** IR Dump Before "):
+ if "*** IR Dump Before " in line:
if len(cur) != 0:
print_chunk(cur, "before", pass_name)
cur = []
cur.append("; " + line)
pass_name = get_pass_name(line, "Before")
- elif line.startswith("*** IR Dump After "):
+ elif "*** IR Dump After " in line:
if len(cur) != 0:
print_chunk(cur, "after", pass_name)
cur = []
diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index 849ae996f733..8a29a57d8d16 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -130,9 +130,10 @@ View the diff from {self.name} here.
if diff is None:
if should_update_gh:
- comment_text = f"""
-:white_check_mark: With the latest revision this PR passed the {self.friendly_name}.
-"""
+ comment_text = (
+ ":white_check_mark: With the latest revision "
+ f"this PR passed the {self.friendly_name}."
+ )
self.update_pr(comment_text, args, create_new=False)
return True
elif len(diff) > 0:
@@ -141,15 +142,17 @@ View the diff from {self.name} here.
self.update_pr(comment_text, args, create_new=True)
else:
print(
- f"Warning: {self.friendly_name}, {self.name} detected some issues with your code formatting..."
+ f"Warning: {self.friendly_name}, {self.name} detected "
+ "some issues with your code formatting..."
)
return False
else:
# The formatter failed but didn't output a diff (e.g. some sort of
# infrastructure failure).
- comment_text = f"""
-:warning: The {self.friendly_name} failed without printing a diff. Check the logs for stderr output. :warning:
-"""
+ comment_text = (
+ f":warning: The {self.friendly_name} failed without printing "
+ "a diff. Check the logs for stderr output. :warning:"
+ )
self.update_pr(comment_text, args, create_new=False)
return False
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index dcc252000698..7f9302e06f8b 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -91,6 +91,7 @@ static_library("bugprone") {
"UnhandledSelfAssignmentCheck.cpp",
"UniquePtrArrayMismatchCheck.cpp",
"UnsafeFunctionsCheck.cpp",
+ "UnusedLocalNonTrivialVariableCheck.cpp",
"UnusedRaiiCheck.cpp",
"UnusedReturnValueCheck.cpp",
"UseAfterMoveCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn
index 808010deef04..7273803dd516 100644
--- a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn
@@ -44,9 +44,17 @@ write_cmake_config("Config") {
}
if (host_os != "win") {
- values += [ "CLANG_HAVE_RLIMITS=1" ]
+ values += [
+ "CLANG_HAVE_DLADDR=1",
+ "CLANG_HAVE_DLFCN_H=1",
+ "CLANG_HAVE_RLIMITS=1",
+ ]
} else {
- values += [ "CLANG_HAVE_RLIMITS=" ]
+ values += [
+ "CLANG_HAVE_DLADDR=",
+ "CLANG_HAVE_DLFCN_H=",
+ "CLANG_HAVE_RLIMITS=",
+ ]
}
if (llvm_enable_libxml2) {
diff --git a/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn
index ecb96b36ea74..58cd59a7c941 100644
--- a/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn
@@ -19,6 +19,7 @@ static_library("Format") {
"IntegerLiteralSeparatorFixer.cpp",
"MacroCallReconstructor.cpp",
"MacroExpander.cpp",
+ "MatchFilePath.cpp",
"NamespaceEndCommentsFixer.cpp",
"ObjCPropertyAttributeOrderFixer.cpp",
"QualifierAlignmentFixer.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 3debc48a4bb9..6059074dfa27 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -120,6 +120,7 @@ copy("Headers") {
"__stddef_wint_t.h",
"__wmmintrin_aes.h",
"__wmmintrin_pclmul.h",
+ "adcintrin.h",
"adxintrin.h",
"altivec.h",
"ammintrin.h",
diff --git a/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn
index c34c11fc55c4..b35061970c22 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn
@@ -35,6 +35,7 @@ unittest("FormatTests") {
"IntegerLiteralSeparatorTest.cpp",
"MacroCallReconstructorTest.cpp",
"MacroExpanderTest.cpp",
+ "MatchFilePathTest.cpp",
"NamespaceEndCommentsFixerTest.cpp",
"ObjCPropertyAttributeOrderFixerTest.cpp",
"QualifierFixerTest.cpp",
diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn
index 57603c6363c1..051907323943 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn
@@ -66,7 +66,6 @@ source_set("sources") {
"sanitizer_flags.cpp",
"sanitizer_flags.h",
"sanitizer_flat_map.h",
- "sanitizer_freebsd.h",
"sanitizer_fuchsia.cpp",
"sanitizer_fuchsia.h",
"sanitizer_getauxval.h",
@@ -116,8 +115,8 @@ source_set("sources") {
"sanitizer_procmaps_solaris.cpp",
"sanitizer_ptrauth.h",
"sanitizer_quarantine.h",
- "sanitizer_range.h",
"sanitizer_range.cpp",
+ "sanitizer_range.h",
"sanitizer_redefine_builtins.h",
"sanitizer_report_decorator.h",
"sanitizer_ring_buffer.h",
@@ -143,7 +142,6 @@ source_set("sources") {
"sanitizer_suppressions.h",
"sanitizer_symbolizer.cpp",
"sanitizer_symbolizer.h",
- "sanitizer_symbolizer_markup_constants.h",
"sanitizer_symbolizer_internal.h",
"sanitizer_symbolizer_libbacktrace.cpp",
"sanitizer_symbolizer_libbacktrace.h",
@@ -152,6 +150,7 @@ source_set("sources") {
"sanitizer_symbolizer_mac.h",
"sanitizer_symbolizer_markup.cpp",
"sanitizer_symbolizer_markup.h",
+ "sanitizer_symbolizer_markup_constants.h",
"sanitizer_symbolizer_markup_fuchsia.cpp",
"sanitizer_symbolizer_posix_libcdep.cpp",
"sanitizer_symbolizer_report.cpp",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index cdd74ecbbca3..fc24717974c5 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -980,10 +980,8 @@ if (current_toolchain == default_toolchain) {
"execution",
"experimental/__config",
"experimental/__memory",
- "experimental/__simd/abi_tag.h",
"experimental/__simd/aligned_tag.h",
"experimental/__simd/declaration.h",
- "experimental/__simd/internal_declaration.h",
"experimental/__simd/reference.h",
"experimental/__simd/scalar.h",
"experimental/__simd/simd.h",
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index 2478b2f8a861..e5fb529b455f 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -205,6 +205,8 @@ write_cmake_config("config") {
if (current_os == "win") {
values += [
"HAVE_DECL_STRERROR_S=1",
+ "HAVE_DLADDR=",
+ "HAVE_DLFCN_H=",
"HAVE_DLOPEN=",
"HAVE_FUTIMES=",
"HAVE_GETPAGESIZE=",
@@ -239,6 +241,8 @@ write_cmake_config("config") {
# POSIX-y system defaults.
values += [
"HAVE_DECL_STRERROR_S=",
+ "HAVE_DLADDR=1",
+ "HAVE_DLFCN_H=1",
"HAVE_DLOPEN=1",
"HAVE_FUTIMES=1",
"HAVE_GETPAGESIZE=1",
@@ -358,16 +362,12 @@ write_cmake_config("llvm-config") {
if (current_os == "win") {
values += [
- "HAVE_DLADDR=",
- "HAVE_DLFCN_H=",
"HAVE_SYSEXITS_H=",
"LLVM_ENABLE_PLUGINS=",
"LLVM_ON_UNIX=",
]
} else {
values += [
- "HAVE_DLADDR=1",
- "HAVE_DLFCN_H=1",
"HAVE_SYSEXITS_H=1",
"LLVM_ENABLE_PLUGINS=1",
"LLVM_ON_UNIX=1",
diff --git a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn
index f1c604aa420b..ed312b248656 100644
--- a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn
@@ -20,5 +20,6 @@ static_library("TextAPI") {
"TextStub.cpp",
"TextStubCommon.cpp",
"TextStubV5.cpp",
+ "Utils.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn
new file mode 100644
index 000000000000..3eeb32aae4e3
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn
@@ -0,0 +1,10 @@
+static_library("BinaryReader") {
+ output_name = "LLVMTextAPIBinaryReader"
+ deps = [
+ "//llvm/lib/Object",
+ "//llvm/lib/Support",
+ "//llvm/lib/TargetParser",
+ "//llvm/lib/TextAPI",
+ ]
+ sources = [ "DylibReader.cpp" ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn
index a562d04c4e90..1e67b1dcf036 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn
@@ -26,10 +26,12 @@ group("symlinks") {
executable("llvm-readtapi") {
deps = [
":TapiOpts",
+ "//llvm/lib/BinaryFormat",
"//llvm/lib/Object",
"//llvm/lib/Option",
"//llvm/lib/Support",
"//llvm/lib/TextAPI",
+ "//llvm/lib/TextAPI/BinaryReader",
]
sources = [
"DiffEngine.cpp",
diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py
index ba3706659550..b9122d07afd8 100644
--- a/llvm/utils/lit/lit/cl_arguments.py
+++ b/llvm/utils/lit/lit/cl_arguments.py
@@ -36,7 +36,7 @@ def parse_args():
metavar="N",
help="Number of workers used for testing",
type=_positive_int,
- default=lit.util.usable_core_count(),
+ default=os.getenv("LIT_MAX_WORKERS", lit.util.usable_core_count()),
)
parser.add_argument(
"--config-prefix",
diff --git a/mlir/include/mlir-c/Dialect/SPIRV.h b/mlir/include/mlir-c/Dialect/SPIRV.h
new file mode 100644
index 000000000000..f22708c9db04
--- /dev/null
+++ b/mlir/include/mlir-c/Dialect/SPIRV.h
@@ -0,0 +1,26 @@
+//===-- mlir-c/Dialect/SPIRV.h - C API for SPIRV dialect ----------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_C_DIALECT_SPIRV_H
+#define MLIR_C_DIALECT_SPIRV_H
+
+#include "mlir-c/IR.h"
+#include "mlir-c/Support.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(SPIRV, spirv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MLIR_C_DIALECT_SPIRV_H
diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h
index 541cdb1e237c..c76cfac07fc7 100644
--- a/mlir/include/mlir/Analysis/DataFlowFramework.h
+++ b/mlir/include/mlir/Analysis/DataFlowFramework.h
@@ -30,8 +30,8 @@ namespace mlir {
//===----------------------------------------------------------------------===//
/// A result type used to indicate if a change happened. Boolean operations on
-/// ChangeResult behave as though `Change` is truthy.
-enum class ChangeResult {
+/// ChangeResult behave as though `Change` is truth.
+enum class [[nodiscard]] ChangeResult {
NoChange,
Change,
};
diff --git a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h
index 9fe2abafd36b..91ed349f461c 100644
--- a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h
+++ b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h
@@ -290,6 +290,11 @@ public:
/// the symbols in two spaces are aligned.
bool isAligned(const PresburgerSpace &other, VarKind kind) const;
+ /// Merge and align symbol variables of `this` and `other` with respect to
+ /// identifiers. After this operation the symbol variables of both spaces have
+ /// the same identifiers in the same order.
+ void mergeAndAlignSymbols(PresburgerSpace &other);
+
void print(llvm::raw_ostream &os) const;
void dump() const;
diff --git a/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h
new file mode 100644
index 000000000000..f8ce8524e41b
--- /dev/null
+++ b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h
@@ -0,0 +1,71 @@
+//===- QuasiPolynomial.h - QuasiPolynomial Class ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of the QuasiPolynomial class for Barvinok's algorithm,
+// which represents a single-valued function on a set of parameters.
+// It is an expression of the form
+// f(x) = \sum_i c_i * \prod_j ⌊g_{ij}(x)⌋
+// where c_i \in Q and
+// g_{ij} : Q^d -> Q are affine functionals over d parameters.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H
+#define MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H
+
+#include "mlir/Analysis/Presburger/Fraction.h"
+#include "mlir/Analysis/Presburger/PresburgerSpace.h"
+
+namespace mlir {
+namespace presburger {
+
+// A class to describe quasi-polynomials.
+// A quasipolynomial consists of a set of terms.
+// The ith term is a constant `coefficients[i]`, multiplied
+// by the product of a set of affine functions on n parameters.
+// Represents functions f : Q^n -> Q of the form
+//
+// f(x) = \sum_i c_i * \prod_j ⌊g_{ij}(x)⌋
+//
+// where c_i \in Q and
+// g_{ij} : Q^n -> Q are affine functionals.
+class QuasiPolynomial : public PresburgerSpace {
+public:
+ QuasiPolynomial(unsigned numVars, SmallVector<Fraction> coeffs = {},
+ std::vector<std::vector<SmallVector<Fraction>>> aff = {});
+
+ // Find the number of inputs (numDomain) to the polynomial.
+ // numSymbols is set to zero.
+ unsigned getNumInputs() const {
+ return getNumDomainVars() + getNumSymbolVars();
+ }
+
+ const SmallVector<Fraction> &getCoefficients() const { return coefficients; }
+
+ const std::vector<std::vector<SmallVector<Fraction>>> &getAffine() const {
+ return affine;
+ }
+
+ // Arithmetic operations.
+ QuasiPolynomial operator+(const QuasiPolynomial &x) const;
+ QuasiPolynomial operator-(const QuasiPolynomial &x) const;
+ QuasiPolynomial operator*(const QuasiPolynomial &x) const;
+ QuasiPolynomial operator/(const Fraction x) const;
+
+ // Removes terms which evaluate to zero from the expression.
+ QuasiPolynomial simplify();
+
+private:
+ SmallVector<Fraction> coefficients;
+ std::vector<std::vector<SmallVector<Fraction>>> affine;
+};
+
+} // namespace presburger
+} // namespace mlir
+
+#endif // MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H
\ No newline at end of file
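For intuition, here is a small worked instance of the form described in the new header (an illustrative example only, not taken from the patch): with a single parameter x, coefficients c = {1/2, 3} and affine functionals g_{11}(x) = x/2, g_{12}(x) = (x+1)/2, g_{21}(x) = x/3 give

    f(x) = (1/2) * ⌊x/2⌋ * ⌊(x+1)/2⌋ + 3 * ⌊x/3⌋

which matches f(x) = \sum_i c_i * \prod_j ⌊g_{ij}(x)⌋ with c_i \in Q and each g_{ij} affine.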
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index c72fde2ab351..efef61b5c6e7 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -19,10 +19,11 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
include "mlir/Dialect/GPU/IR/CompilationAttrs.td"
include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td"
include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td"
+include "mlir/IR/CommonTypeConstraints.td"
include "mlir/IR/EnumAttr.td"
-include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/IR/SymbolInterfaces.td"
include "mlir/Interfaces/DataLayoutInterfaces.td"
+include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -712,7 +713,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
```
operation ::= `gpu.launch` (`async` (`[` ssa-id-list `]`)? )?
- `block` `(` ssa-id-list `)` `in` ssa-reassignment
+ `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
`threads` `(` ssa-id-list `)` `in` ssa-reassignment
(dynamic_shared_memory_size ssa-use)?
memory-attribution
@@ -1023,16 +1024,23 @@ def GPU_AllReduceOp : GPU_Op<"all_reduce",
let hasRegionVerifier = 1;
}
+def AnyIntegerOrFloatOr1DVector :
+ AnyTypeOf<[AnyIntegerOrFloat, VectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>;
+
def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> {
let summary = "Reduce values among subgroup.";
let description = [{
The `subgroup_reduce` op reduces the value of every work item across a
subgroup. The result is equal for all work items of a subgroup.
+ When the reduced value is of a vector type, each vector element is reduced
+ independently. Only 1-d vector types are allowed.
+
Example:
```mlir
- %1 = gpu.subgroup_reduce add %0 : (f32) -> (f32)
+ %1 = gpu.subgroup_reduce add %a : (f32) -> (f32)
+ %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> (vector<4xf16>)
```
If `uniform` flag is set either none or all work items of a subgroup
@@ -1045,11 +1053,11 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]
}];
let arguments = (ins
- AnyIntegerOrFloat:$value,
+ AnyIntegerOrFloatOr1DVector:$value,
GPU_AllReduceOperationAttr:$op,
UnitAttr:$uniform
);
- let results = (outs AnyIntegerOrFloat:$result);
+ let results = (outs AnyIntegerOrFloatOr1DVector:$result);
let assemblyFormat = [{ custom<AllReduceOperation>($op) $value
(`uniform` $uniform^)? attr-dict
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index c6c02ccaafbc..6c5bf75d2124 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -15,6 +15,7 @@
#include "Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include <optional>
@@ -62,6 +63,13 @@ void populateGpuShufflePatterns(RewritePatternSet &patterns);
/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
void populateGpuAllReducePatterns(RewritePatternSet &patterns);
+/// Collect a set of patterns to break down subgroup_reduce ops into smaller
+/// ones supported by the target, i.e. ops whose value bitwidth `size`
+/// satisfies `size <= maxShuffleBitwidth`.
+void populateGpuBreakDownSubgrupReducePatterns(RewritePatternSet &patterns,
+ unsigned maxShuffleBitwidth = 32,
+ PatternBenefit benefit = 1);
+
/// Collect all patterns to rewrite ops within the GPU dialect.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
populateGpuAllReducePatterns(patterns);
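As a rough illustration of how the new helper declared above is typically consumed (a hypothetical sketch, not part of the patch; the pass name is invented, and it assumes the standard MLIR greedy-rewrite boilerplate from mlir/Transforms/GreedyPatternRewriteDriver.h):

    #include "mlir/Dialect/GPU/Transforms/Passes.h"
    #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

    // Hypothetical pass body: populate the new breakdown patterns and run the
    // greedy rewrite driver over the current operation.
    void MyBreakDownSubgroupReducePass::runOnOperation() {
      mlir::RewritePatternSet patterns(&getContext());
      // Split subgroup_reduce values wider than 32 bits into smaller reductions.
      mlir::populateGpuBreakDownSubgrupReducePatterns(
          patterns, /*maxShuffleBitwidth=*/32);
      if (mlir::failed(mlir::applyPatternsAndFoldGreedily(getOperation(),
                                                          std::move(patterns))))
        signalPassFailure();
    }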
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index a798cad60377..a4f08fb92da9 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -435,6 +435,14 @@ def LLVM_CoroResumeOp : LLVM_IntrOp<"coro.resume", [], [], [], 0> {
let assemblyFormat = "$handle attr-dict `:` qualified(type($handle))";
}
+def LLVM_CoroPromiseOp : LLVM_IntrOp<"coro.promise", [], [], [], 1> {
+ let arguments = (ins LLVM_AnyPointer:$handle,
+ I32:$align,
+ I1:$from);
+ let results = (outs LLVM_AnyPointer:$res);
+ let assemblyFormat = "$handle `,` $align `,` $from attr-dict `:` functional-type(operands, results)";
+}
+
//
// Debug function intrinsics.
//
diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
index 211cb31d50bd..fdb9ec09ae3e 100644
--- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td
+++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
@@ -142,12 +142,6 @@ def Math_AbsIOp : Math_IntegerUnaryOp<"absi"> {
def Math_AtanOp : Math_FloatUnaryOp<"atan">{
let summary = "arcus tangent of the given value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.atan` ssa-use `:` type
- ```
-
The `atan` operation computes the arcus tangent of a given value. It takes
one operand of floating point type (i.e., scalar, tensor or vector) and returns
one result of the same type. It has no standard attributes.
@@ -169,12 +163,6 @@ def Math_AtanOp : Math_FloatUnaryOp<"atan">{
def Math_Atan2Op : Math_FloatBinaryOp<"atan2">{
let summary = "2-argument arcus tangent of the given values";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.atan2` ssa-use `,` ssa-use `:` type
- ```
-
The `atan2` operation takes two operands and returns one result, all of
which must be of the same type. The operands must be of floating point type
(i.e., scalar, tensor or vector).
@@ -225,12 +213,6 @@ def Math_CbrtOp : Math_FloatUnaryOp<"cbrt"> {
def Math_CeilOp : Math_FloatUnaryOp<"ceil"> {
let summary = "ceiling of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.ceil` ssa-use `:` type
- ```
-
The `ceil` operation computes the ceiling of a given value. It takes one
operand of floating point type (i.e., scalar, tensor or vector) and returns one
result of the same type. It has no standard attributes.
@@ -252,12 +234,6 @@ def Math_CeilOp : Math_FloatUnaryOp<"ceil"> {
def Math_CopySignOp : Math_FloatBinaryOp<"copysign"> {
let summary = "A copysign operation";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.copysign` ssa-use `,` ssa-use `:` type
- ```
-
The `copysign` returns a value with the magnitude of the first operand and
the sign of the second operand. It takes two operands and returns one result of
the same type. The operands must be of floating point type (i.e., scalar,
@@ -280,12 +256,6 @@ def Math_CopySignOp : Math_FloatBinaryOp<"copysign"> {
def Math_CosOp : Math_FloatUnaryOp<"cos"> {
let summary = "cosine of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.cos` ssa-use `:` type
- ```
-
The `cos` operation computes the cosine of a given value. It takes one
operand of floating point type (i.e., scalar, tensor or vector) and returns one
result of the same type. It has no standard attributes.
@@ -307,12 +277,6 @@ def Math_CosOp : Math_FloatUnaryOp<"cos"> {
def Math_AcosOp : Math_FloatUnaryOp<"acos"> {
let summary = "arcus cosine of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.acos` ssa-use `:` type
- ```
-
The `acos` operation computes the arcus cosine of a given value. It takes one
operand of floating point type (i.e., scalar, tensor or vector) and returns one
result of the same type. It has no standard attributes.
@@ -355,12 +319,6 @@ def Math_CoshOp : Math_FloatUnaryOp<"cosh"> {
def Math_SinOp : Math_FloatUnaryOp<"sin"> {
let summary = "sine of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.sin` ssa-use `:` type
- ```
-
The `sin` operation computes the sine of a given value. It takes one
operand of floating point type (i.e., scalar, tensor or vector) and returns one
result of the same type. It has no standard attributes.
@@ -463,12 +421,6 @@ def Math_CtPopOp : Math_IntegerUnaryOp<"ctpop"> {
def Math_ErfOp : Math_FloatUnaryOp<"erf"> {
let summary = "error function of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.erf` ssa-use `:` type
- ```
-
The `erf` operation computes the error function. It takes one operand of
floating point type (i.e., scalar, tensor or vector) and returns one result of
the same type. It has no standard attributes.
@@ -491,12 +443,6 @@ def Math_ErfOp : Math_FloatUnaryOp<"erf"> {
def Math_ExpOp : Math_FloatUnaryOp<"exp"> {
let summary = "base-e exponential of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.exp` ssa-use `:` type
- ```
-
The `exp` operation takes one operand of floating point type (i.e., scalar,
tensor or vector) and returns one result of the same type. It has no standard
attributes.
@@ -519,12 +465,6 @@ def Math_Exp2Op : Math_FloatUnaryOp<"exp2"> {
let summary = "base-2 exponential of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.exp2` ssa-use `:` type
- ```
-
The `exp` operation takes one operand of floating point type (i.e., scalar,
tensor or vector) and returns one result of the same type. It has no standard
attributes.
@@ -546,12 +486,6 @@ def Math_Exp2Op : Math_FloatUnaryOp<"exp2"> {
def Math_ExpM1Op : Math_FloatUnaryOp<"expm1"> {
let summary = "base-e exponential of the specified value minus 1";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.expm1` ssa-use `:` type
- ```
-
expm1(x) := exp(x) - 1
The `expm1` operation takes one operand of floating point type (i.e.,
@@ -575,12 +509,6 @@ def Math_ExpM1Op : Math_FloatUnaryOp<"expm1"> {
def Math_FloorOp : Math_FloatUnaryOp<"floor"> {
let summary = "floor of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.floor` ssa-use `:` type
- ```
-
The `floor` operation computes the floor of a given value. It takes one
operand of floating point type (i.e., scalar, tensor or vector) and returns one
result of the same type. It has no standard attributes.
@@ -603,12 +531,6 @@ def Math_FloorOp : Math_FloatUnaryOp<"floor"> {
def Math_FmaOp : Math_FloatTernaryOp<"fma"> {
let summary = "floating point fused multipy-add operation";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.fma` ssa-use `,` ssa-use `,` ssa-use `:` type
- ```
-
The `fma` operation takes three operands and returns one result, each of
these is required to be the same type. Operands must be of floating point type
(i.e., scalar, tensor or vector).
@@ -634,12 +556,6 @@ def Math_FmaOp : Math_FloatTernaryOp<"fma"> {
def Math_IPowIOp : Math_IntegerBinaryOp<"ipowi"> {
let summary = "signed integer raised to the power of operation";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.ipowi` ssa-use `,` ssa-use `:` type
- ```
-
The `ipowi` operation takes two operands of integer type (i.e., scalar,
tensor or vector) and returns one result of the same type. Operands
must have the same type.
@@ -751,12 +667,6 @@ def Math_Log2Op : Math_FloatUnaryOp<"log2"> {
def Math_PowFOp : Math_FloatBinaryOp<"powf"> {
let summary = "floating point raised to the power of operation";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.powf` ssa-use `,` ssa-use `:` type
- ```
-
The `powf` operation takes two operands of floating point type (i.e.,
scalar, tensor or vector) and returns one result of the same type. Operands
must have the same type.
@@ -861,12 +771,6 @@ def Math_TanhOp : Math_FloatUnaryOp<"tanh"> {
def Math_RoundEvenOp : Math_FloatUnaryOp<"roundeven"> {
let summary = "round of the specified value with halfway cases to even";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.roundeven` ssa-use `:` type
- ```
-
The `roundeven` operation returns the operand rounded to the nearest integer
value in floating-point format. It takes one operand of floating point type
(i.e., scalar, tensor or vector) and produces one result of the same type. The
@@ -891,12 +795,6 @@ def Math_RoundEvenOp : Math_FloatUnaryOp<"roundeven"> {
def Math_RoundOp : Math_FloatUnaryOp<"round"> {
let summary = "round of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.round` ssa-use `:` type
- ```
-
The `round` operation returns the operand rounded to the nearest integer
value in floating-point format. It takes one operand of floating point type
(i.e., scalar, tensor or vector) and produces one result of the same type. The
@@ -921,12 +819,6 @@ def Math_RoundOp : Math_FloatUnaryOp<"round"> {
def Math_TruncOp : Math_FloatUnaryOp<"trunc"> {
let summary = "trunc of the specified value";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.trunc` ssa-use `:` type
- ```
-
The `trunc` operation returns the operand rounded to the nearest integer
value in floating-point format. It takes one operand of floating point type
(i.e., scalar, tensor or vector) and produces one result of the same type.
@@ -952,12 +844,6 @@ def Math_FPowIOp : Math_Op<"fpowi",
DeclareOpInterfaceMethods<ArithFastMathInterface>]> {
let summary = "floating point raised to the signed integer power";
let description = [{
- Syntax:
-
- ```
- operation ::= ssa-id `=` `math.fpowi` ssa-use `,` ssa-use `:` type
- ```
-
The `fpowi` operation takes a `base` operand of floating point type
(i.e. scalar, tensor or vector) and a `power` operand of integer type
(also scalar, tensor or vector) and returns one result of the same type
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index a78c3e98c955..234c1076e14e 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -156,29 +156,46 @@ def DeclareActionAttr : OpenACC_Attr<"DeclareAction", "declare_action"> {
}
// Device type enumeration.
-def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 0, "star">;
-def OpenACC_DeviceTypeDefault : I32EnumAttrCase<"Default", 1, "default">;
-def OpenACC_DeviceTypeHost : I32EnumAttrCase<"Host", 2, "host">;
-def OpenACC_DeviceTypeMulticore : I32EnumAttrCase<"Multicore", 3, "multicore">;
-def OpenACC_DeviceTypeNvidia : I32EnumAttrCase<"Nvidia", 4, "nvidia">;
-def OpenACC_DeviceTypeRadeon : I32EnumAttrCase<"Radeon", 5, "radeon">;
-
+def OpenACC_DeviceTypeNone : I32EnumAttrCase<"None", 0, "none">;
+def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 1, "star">;
+def OpenACC_DeviceTypeDefault : I32EnumAttrCase<"Default", 2, "default">;
+def OpenACC_DeviceTypeHost : I32EnumAttrCase<"Host", 3, "host">;
+def OpenACC_DeviceTypeMulticore : I32EnumAttrCase<"Multicore", 4, "multicore">;
+def OpenACC_DeviceTypeNvidia : I32EnumAttrCase<"Nvidia", 5, "nvidia">;
+def OpenACC_DeviceTypeRadeon : I32EnumAttrCase<"Radeon", 6, "radeon">;
def OpenACC_DeviceType : I32EnumAttr<"DeviceType",
"built-in device type supported by OpenACC",
- [OpenACC_DeviceTypeStar, OpenACC_DeviceTypeDefault,
+ [OpenACC_DeviceTypeNone, OpenACC_DeviceTypeStar, OpenACC_DeviceTypeDefault,
OpenACC_DeviceTypeHost, OpenACC_DeviceTypeMulticore,
OpenACC_DeviceTypeNvidia, OpenACC_DeviceTypeRadeon
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::acc";
}
+
+// The device_type attribute is used to associate a value with the clauses that
+// appear after a device_type clause. The list of clauses allowed after the
+// device_type clause is defined per construct as follows:
+// Loop construct: collapse, gang, worker, vector, seq, independent, auto,
+// and tile
+// Compute construct: async, wait, num_gangs, num_workers, and vector_length
+// Data construct: async and wait
+// Routine: gang, worker, vector, seq and bind
+//
+// `none` means that the value appears before any device_type clause.
+//
def OpenACC_DeviceTypeAttr : EnumAttr<OpenACC_Dialect,
OpenACC_DeviceType,
"device_type"> {
let assemblyFormat = [{ ```<` $value `>` }];
}
+def DeviceTypeArrayAttr :
+ TypedArrayAttrBase<OpenACC_DeviceTypeAttr, "device type array attribute"> {
+ let constBuilderCall = ?;
+}
+
// Define a resource for the OpenACC runtime counters.
def OpenACC_RuntimeCounters : Resource<"::mlir::acc::RuntimeCounters">;
@@ -863,24 +880,32 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
```
}];
- let arguments = (ins Optional<IntOrIndex>:$async,
- UnitAttr:$asyncAttr,
- Variadic<IntOrIndex>:$waitOperands,
- UnitAttr:$waitAttr,
- Variadic<IntOrIndex>:$numGangs,
- Optional<IntOrIndex>:$numWorkers,
- Optional<IntOrIndex>:$vectorLength,
- Optional<I1>:$ifCond,
- Optional<I1>:$selfCond,
- UnitAttr:$selfAttr,
- Variadic<AnyType>:$reductionOperands,
- OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes,
- Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands,
- OptionalAttr<SymbolRefArrayAttr>:$privatizations,
- Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands,
- OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations,
- Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands,
- OptionalAttr<DefaultValueAttr>:$defaultAttr);
+ let arguments = (ins
+ Variadic<IntOrIndex>:$async,
+ OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType,
+ OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly,
+ Variadic<IntOrIndex>:$waitOperands,
+ OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments,
+ OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType,
+ OptionalAttr<DeviceTypeArrayAttr>:$waitOnly,
+ Variadic<IntOrIndex>:$numGangs,
+ OptionalAttr<DenseI32ArrayAttr>:$numGangsSegments,
+ OptionalAttr<DeviceTypeArrayAttr>:$numGangsDeviceType,
+ Variadic<IntOrIndex>:$numWorkers,
+ OptionalAttr<DeviceTypeArrayAttr>:$numWorkersDeviceType,
+ Variadic<IntOrIndex>:$vectorLength,
+ OptionalAttr<DeviceTypeArrayAttr>:$vectorLengthDeviceType,
+ Optional<I1>:$ifCond,
+ Optional<I1>:$selfCond,
+ UnitAttr:$selfAttr,
+ Variadic<AnyType>:$reductionOperands,
+ OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes,
+ Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands,
+ OptionalAttr<SymbolRefArrayAttr>:$privatizations,
+ Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands,
+ OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations,
+ Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands,
+ OptionalAttr<DefaultValueAttr>:$defaultAttr);
let regions = (region AnyRegion:$region);
@@ -890,22 +915,69 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
/// The i-th data operand passed.
Value getDataOperand(unsigned i);
+
+ /// Return true if the op has the async attribute for the
+ /// mlir::acc::DeviceType::None device_type.
+ bool hasAsyncOnly();
+ /// Return true if the op has the async attribute for the given device_type.
+ bool hasAsyncOnly(mlir::acc::DeviceType deviceType);
+ /// Return the value of the async clause if present.
+ mlir::Value getAsyncValue();
+ /// Return the value of the async clause for the given device_type if
+ /// present.
+ mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType);
+
+ /// Return the value of the num_workers clause if present.
+ mlir::Value getNumWorkersValue();
+ /// Return the value of the num_workers clause for the given device_type if
+ /// present.
+ mlir::Value getNumWorkersValue(mlir::acc::DeviceType deviceType);
+
+ /// Return the value of the vector_length clause if present.
+ mlir::Value getVectorLengthValue();
+ /// Return the value of the vector_length clause for the given device_type
+ /// if present.
+ mlir::Value getVectorLengthValue(mlir::acc::DeviceType deviceType);
+
+ /// Return the values of the num_gangs clause if present.
+ mlir::Operation::operand_range getNumGangsValues();
+ /// Return the values of the num_gangs clause for the given device_type if
+ /// present.
+ mlir::Operation::operand_range
+ getNumGangsValues(mlir::acc::DeviceType deviceType);
+
+ /// Return true if the op has the wait attribute for the
+ /// mlir::acc::DeviceType::None device_type.
+ bool hasWaitOnly();
+ /// Return true if the op has the wait attribute for the given device_type.
+ bool hasWaitOnly(mlir::acc::DeviceType deviceType);
+ /// Return the values of the wait clause if present.
+ mlir::Operation::operand_range getWaitValues();
+ /// Return the values of the wait clause for the given device_type if
+ /// present.
+ mlir::Operation::operand_range
+ getWaitValues(mlir::acc::DeviceType deviceType);
}];
let assemblyFormat = [{
oilist(
`dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)`
- | `async` `(` $async `:` type($async) `)`
+ | `async` `(` custom<DeviceTypeOperands>($async,
+ type($async), $asyncDeviceType) `)`
| `firstprivate` `(` custom<SymOperandList>($gangFirstPrivateOperands,
type($gangFirstPrivateOperands), $firstprivatizations)
`)`
- | `num_gangs` `(` $numGangs `:` type($numGangs) `)`
- | `num_workers` `(` $numWorkers `:` type($numWorkers) `)`
+ | `num_gangs` `(` custom<NumGangs>($numGangs,
+ type($numGangs), $numGangsDeviceType, $numGangsSegments) `)`
+ | `num_workers` `(` custom<DeviceTypeOperands>($numWorkers,
+ type($numWorkers), $numWorkersDeviceType) `)`
| `private` `(` custom<SymOperandList>(
$gangPrivateOperands, type($gangPrivateOperands), $privatizations)
`)`
- | `vector_length` `(` $vectorLength `:` type($vectorLength) `)`
- | `wait` `(` $waitOperands `:` type($waitOperands) `)`
+ | `vector_length` `(` custom<DeviceTypeOperands>($vectorLength,
+ type($vectorLength), $vectorLengthDeviceType) `)`
+ | `wait` `(` custom<WaitOperands>($waitOperands,
+ type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)`
| `self` `(` $selfCond `)`
| `if` `(` $ifCond `)`
| `reduction` `(` custom<SymOperandList>(
@@ -939,21 +1011,25 @@ def OpenACC_SerialOp : OpenACC_Op<"serial",
```
}];
- let arguments = (ins Optional<IntOrIndex>:$async,
- UnitAttr:$asyncAttr,
- Variadic<IntOrIndex>:$waitOperands,
- UnitAttr:$waitAttr,
- Optional<I1>:$ifCond,
- Optional<I1>:$selfCond,
- UnitAttr:$selfAttr,
- Variadic<AnyType>:$reductionOperands,
- OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes,
- Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands,
- OptionalAttr<SymbolRefArrayAttr>:$privatizations,
- Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands,
- OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations,
- Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands,
- OptionalAttr<DefaultValueAttr>:$defaultAttr);
+ let arguments = (ins
+ Variadic<IntOrIndex>:$async,
+ OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType,
+ OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly,
+ Variadic<IntOrIndex>:$waitOperands,
+ OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments,
+ OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType,
+ OptionalAttr<DeviceTypeArrayAttr>:$waitOnly,
+ Optional<I1>:$ifCond,
+ Optional<I1>:$selfCond,
+ UnitAttr:$selfAttr,
+ Variadic<AnyType>:$reductionOperands,
+ OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes,
+ Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands,
+ OptionalAttr<SymbolRefArrayAttr>:$privatizations,
+ Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands,
+ OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations,
+ Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands,
+ OptionalAttr<DefaultValueAttr>:$defaultAttr);
let regions = (region AnyRegion:$region);
@@ -963,19 +1039,44 @@ def OpenACC_SerialOp : OpenACC_Op<"serial",
/// The i-th data operand passed.
Value getDataOperand(unsigned i);
+
+ /// Return true if the op has the async attribute for the
+ /// mlir::acc::DeviceType::None device_type.
+ bool hasAsyncOnly();
+ /// Return true if the op has the async attribute for the given device_type.
+ bool hasAsyncOnly(mlir::acc::DeviceType deviceType);
+ /// Return the value of the async clause if present.
+ mlir::Value getAsyncValue();
+ /// Return the value of the async clause for the given device_type if
+ /// present.
+ mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType);
+
+ /// Return true if the op has the wait attribute for the
+ /// mlir::acc::DeviceType::None device_type.
+ bool hasWaitOnly();
+ /// Return true if the op has the wait attribute for the given device_type.
+ bool hasWaitOnly(mlir::acc::DeviceType deviceType);
+ /// Return the values of the wait clause if present.
+ mlir::Operation::operand_range getWaitValues();
+ /// Return the values of the wait clause for the given device_type if
+ /// present.
+ mlir::Operation::operand_range
+ getWaitValues(mlir::acc::DeviceType deviceType);
}];
let assemblyFormat = [{
oilist(
`dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)`
- | `async` `(` $async `:` type($async) `)`
+ | `async` `(` custom<DeviceTypeOperands>($async,
+ type($async), $asyncDeviceType) `)`
| `firstprivate` `(` custom<SymOperandList>($gangFirstPrivateOperands,
type($gangFirstPrivateOperands), $firstprivatizations)
`)`
| `private` `(` custom<SymOperandList>(
$gangPrivateOperands, type($gangPrivateOperands), $privatizations)
`)`
- | `wait` `(` $waitOperands `:` type($waitOperands) `)`
+ | `wait` `(` custom<WaitOperands>($waitOperands,
+ type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)`
| `self` `(` $selfCond `)`
| `if` `(` $ifCond `)`
| `reduction` `(` custom<SymOperandList>(
@@ -1011,18 +1112,26 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels",
```
}];
- let arguments = (ins Optional<IntOrIndex>:$async,
- UnitAttr:$asyncAttr,
- Variadic<IntOrIndex>:$waitOperands,
- UnitAttr:$waitAttr,
- Variadic<IntOrIndex>:$numGangs,
- Optional<IntOrIndex>:$numWorkers,
- Optional<IntOrIndex>:$vectorLength,
- Optional<I1>:$ifCond,
- Optional<I1>:$selfCond,
- UnitAttr:$selfAttr,
- Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands,
- OptionalAttr<DefaultValueAttr>:$defaultAttr);
+ let arguments = (ins
+ Variadic<IntOrIndex>:$async,
+ OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType,
+ OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly,
+ Variadic<IntOrIndex>:$waitOperands,
+ OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments,
+ OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType,
+ OptionalAttr<DeviceTypeArrayAttr>:$waitOnly,
+ Variadic<IntOrIndex>:$numGangs,
+ OptionalAttr<DenseI32ArrayAttr>:$numGangsSegments,
+ OptionalAttr<DeviceTypeArrayAttr>:$numGangsDeviceType,
+ Variadic<IntOrIndex>:$numWorkers,
+ OptionalAttr<DeviceTypeArrayAttr>:$numWorkersDeviceType,
+ Variadic<IntOrIndex>:$vectorLength,
+ OptionalAttr<DeviceTypeArrayAttr>:$vectorLengthDeviceType,
+ Optional<I1>:$ifCond,
+ Optional<I1>:$selfCond,
+ UnitAttr:$selfAttr,
+ Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands,
+ OptionalAttr<DefaultValueAttr>:$defaultAttr);
let regions = (region AnyRegion:$region);
@@ -1032,16 +1141,63 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels",
/// The i-th data operand passed.
Value getDataOperand(unsigned i);
+
+ /// Return true if the op has the async attribute for the
+ /// mlir::acc::DeviceType::None device_type.
+ bool hasAsyncOnly();
+ /// Return true if the op has the async attribute for the given device_type.
+ bool hasAsyncOnly(mlir::acc::DeviceType deviceType);
+ /// Return the value of the async clause if present.
+ mlir::Value getAsyncValue();
+ /// Return the value of the async clause for the given device_type if
+ /// present.
+ mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType);
+
+ /// Return the value of the num_workers clause if present.
+ mlir::Value getNumWorkersValue();
+ /// Return the value of the num_workers clause for the given device_type if
+ /// present.
+ mlir::Value getNumWorkersValue(mlir::acc::DeviceType deviceType);
+
+ /// Return the value of the vector_length clause if present.
+ mlir::Value getVectorLengthValue();
+ /// Return the value of the vector_length clause for the given device_type
+ /// if present.
+ mlir::Value getVectorLengthValue(mlir::acc::DeviceType deviceType);
+
+ /// Return the values of the num_gangs clause if present.
+ mlir::Operation::operand_range getNumGangsValues();
+ /// Return the values of the num_gangs clause for the given device_type if
+ /// present.
+ mlir::Operation::operand_range
+ getNumGangsValues(mlir::acc::DeviceType deviceType);
+
+ /// Return true if the op has the wait attribute for the
+ /// mlir::acc::DeviceType::None device_type.
+ bool hasWaitOnly();
+ /// Return true if the op has the wait attribute for the given device_type.
+ bool hasWaitOnly(mlir::acc::DeviceType deviceType);
+ /// Return the values of the wait clause if present.
+ mlir::Operation::operand_range getWaitValues();
+ /// Return the values of the wait clause for the given device_type if
+ /// present.
+ mlir::Operation::operand_range
+ getWaitValues(mlir::acc::DeviceType deviceType);
}];
let assemblyFormat = [{
oilist(
`dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)`
- | `async` `(` $async `:` type($async) `)`
- | `num_gangs` `(` $numGangs `:` type($numGangs) `)`
- | `num_workers` `(` $numWorkers `:` type($numWorkers) `)`
- | `vector_length` `(` $vectorLength `:` type($vectorLength) `)`
- | `wait` `(` $waitOperands `:` type($waitOperands) `)`
+ | `async` `(` custom<DeviceTypeOperands>($async,
+ type($async), $asyncDeviceType) `)`
+ | `num_gangs` `(` custom<NumGangs>($numGangs,
+ type($numGangs), $numGangsDeviceType, $numGangsSegments) `)`
+ | `num_workers` `(` custom<DeviceTypeOperands>($numWorkers,
+ type($numWorkers), $numWorkersDeviceType) `)`
+ | `vector_length` `(` custom<DeviceTypeOperands>($vectorLength,
+ type($vectorLength), $vectorLengthDeviceType) `)`
+ | `wait` `(` custom<WaitOperands>($waitOperands,
+ type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)`
| `self` `(` $selfCond `)`
| `if` `(` $ifCond `)`
)
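
Each compute construct now exposes its clause values per device_type: the zero-argument getters query the mlir::acc::DeviceType::None entry, while the overloads taking a device_type look up that device's entry. Below is a minimal sketch of how client code might query an acc.kernels op; the helper name is illustrative, and it assumes the getters return a null value or an empty range when the clause is absent for the requested device_type.

```cpp
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative helper: report the clause operands of an acc.kernels op for a
// particular device_type.
static void reportKernelsClauses(mlir::acc::KernelsOp op,
                                 mlir::acc::DeviceType deviceType) {
  if (op.hasAsyncOnly(deviceType))
    llvm::errs() << "async (no operand)\n";
  else if (mlir::Value async = op.getAsyncValue(deviceType))
    llvm::errs() << "async operand: " << async << "\n";

  if (mlir::Value numWorkers = op.getNumWorkersValue(deviceType))
    llvm::errs() << "num_workers: " << numWorkers << "\n";
  if (mlir::Value vectorLength = op.getVectorLengthValue(deviceType))
    llvm::errs() << "vector_length: " << vectorLength << "\n";

  for (mlir::Value gangs : op.getNumGangsValues(deviceType))
    llvm::errs() << "num_gangs operand: " << gangs << "\n";

  if (op.hasWaitOnly(deviceType))
    llvm::errs() << "wait (no operands)\n";
  else
    for (mlir::Value wait : op.getWaitValues(deviceType))
      llvm::errs() << "wait operand: " << wait << "\n";
}
```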
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td
index 51124e141c6d..22d5afcd7738 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td
@@ -582,6 +582,8 @@ def SPIRV_SNegateOp : SPIRV_ArithmeticUnaryOp<"SNegate",
%3 = spirv.SNegate %2 : vector<4xi32>
```
}];
+
+ let hasFolder = 1;
}
// -----
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td
index b460c8e68aa0..38639a175ab4 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td
@@ -462,6 +462,8 @@ def SPIRV_NotOp : SPIRV_BitUnaryOp<"Not", [UsableInSpecConstantOp]> {
%3 = spirv.Not %1 : vector<4xi32>
```
}];
+
+ let hasFolder = 1;
}
#endif // MLIR_DIALECT_SPIRV_IR_BIT_OPS
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
index 2e26c44de281..e48a56f0625d 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
@@ -534,6 +534,7 @@ def SPIRV_LogicalNotOp : SPIRV_LogicalUnaryOp<"LogicalNot",
}];
let hasCanonicalizer = 1;
+ let hasFolder = 1;
}
// -----
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td
index 654c6aff0c64..185cff46ae25 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td
@@ -30,13 +30,6 @@ def SparseTensor_StorageSpecifier : SparseTensor_Type<"StorageSpecifier"> {
let summary = "Structured metadata for sparse tensor low-level storage scheme";
let description = [{
- Syntax:
-
- ```
- storage_specifier-type ::= `!storage_specifier` `<` encoding `>`
- encoding ::= attribute-value
- ```
-
Values with storage_specifier types represent aggregated storage scheme
metadata for the given sparse tensor encoding. It currently holds
a set of values for level-sizes, coordinate arrays, position arrays,
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h
index 06642adda42b..0a21c9922b22 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h
+++ b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h
@@ -163,9 +163,6 @@ void populateFoldConstantExtractSlicePatterns(
return false;
});
-/// Patterns to simplify tensor.pack.
-void populateSimplifyTensorPack(RewritePatternSet &patterns);
-
} // namespace tensor
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
index 44b8377bd6aa..35b519e790d1 100644
--- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
@@ -74,6 +74,11 @@ void populateFoldTensorEmptyPatterns(RewritePatternSet &patterns,
/// that it can be bufferized into a sequence of copies.
void populateDecomposeTensorConcatPatterns(RewritePatternSet &patterns);
+/// Populates `patterns` with patterns that simplify `tensor.pack` and
+/// `tensor.unpack` operations.
+/// TODO: Add a pattern to convert tensor.unpack op to tensor.collapse_shape op.
+void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns);
+
/// Populates `patterns` with patterns that fold operations like `tensor.pad`
/// and `tensor.extract_slice` into `tensor.pack` and `tensor.unpack` operations
/// respectively.
diff --git a/mlir/include/mlir/Dialect/UB/IR/UBOps.td b/mlir/include/mlir/Dialect/UB/IR/UBOps.td
index beaf5616d6c6..f3d5a26ef6f9 100644
--- a/mlir/include/mlir/Dialect/UB/IR/UBOps.td
+++ b/mlir/include/mlir/Dialect/UB/IR/UBOps.td
@@ -48,12 +48,6 @@ def PoisonOp : UB_Op<"poison", [ConstantLike, Pure]> {
semantics (e.g. partially poisoned vectors), default value indicates results
is fully poisoned.
- Syntax:
-
- ```
- poison-op ::= `poison` (`<` value `>`)? `:` type
- ```
-
Examples:
```
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 423118f79e73..40d874dc99dd 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -1582,22 +1582,27 @@ def Vector_LoadOp : Vector_Op<"load"> {
vector. If the memref element type is vector, it should match the result
vector type.
- Example 1: 1-D vector load on a scalar memref.
+ Example: 0-D vector load on a scalar memref.
+ ```mlir
+ %result = vector.load %base[%i, %j] : memref<100x100xf32>, vector<f32>
+ ```
+
+ Example: 1-D vector load on a scalar memref.
```mlir
%result = vector.load %base[%i, %j] : memref<100x100xf32>, vector<8xf32>
```
- Example 2: 1-D vector load on a vector memref.
+ Example: 1-D vector load on a vector memref.
```mlir
%result = vector.load %memref[%i, %j] : memref<200x100xvector<8xf32>>, vector<8xf32>
```
- Example 3: 2-D vector load on a scalar memref.
+ Example: 2-D vector load on a scalar memref.
```mlir
%result = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<4x8xf32>
```
- Example 4: 2-D vector load on a vector memref.
+ Example: 2-D vector load on a vector memref.
```mlir
%result = vector.load %memref[%i, %j] : memref<200x100xvector<4x8xf32>>, vector<4x8xf32>
```
@@ -1608,12 +1613,12 @@ def Vector_LoadOp : Vector_Op<"load"> {
loaded out of bounds. Not all targets may support out-of-bounds vector
loads.
- Example 5: Potential out-of-bound vector load.
+ Example: Potential out-of-bound vector load.
```mlir
%result = vector.load %memref[%index] : memref<?xf32>, vector<8xf32>
```
- Example 6: Explicit out-of-bound vector load.
+ Example: Explicit out-of-bound vector load.
```mlir
%result = vector.load %memref[%c0] : memref<7xf32>, vector<8xf32>
```
@@ -1622,7 +1627,7 @@ def Vector_LoadOp : Vector_Op<"load"> {
let arguments = (ins Arg<AnyMemRef, "the reference to load from",
[MemRead]>:$base,
Variadic<Index>:$indices);
- let results = (outs AnyVector:$result);
+ let results = (outs AnyVectorOfAnyRank:$result);
let extraClassDeclaration = [{
MemRefType getMemRefType() {
@@ -1660,22 +1665,27 @@ def Vector_StoreOp : Vector_Op<"store"> {
to store. If the memref element type is vector, it should match the type
of the value to store.
- Example 1: 1-D vector store on a scalar memref.
+ Example: 0-D vector store on a scalar memref.
+ ```mlir
+ vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ ```
+
+ Example: 1-D vector store on a scalar memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<8xf32>
```
- Example 2: 1-D vector store on a vector memref.
+ Example: 1-D vector store on a vector memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xvector<8xf32>>, vector<8xf32>
```
- Example 3: 2-D vector store on a scalar memref.
+ Example: 2-D vector store on a scalar memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<4x8xf32>
```
- Example 4: 2-D vector store on a vector memref.
+ Example: 2-D vector store on a vector memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xvector<4x8xf32>>, vector<4x8xf32>
```
@@ -1685,21 +1695,23 @@ def Vector_StoreOp : Vector_Op<"store"> {
target-specific. No assumptions should be made on the memory written out of
bounds. Not all targets may support out-of-bounds vector stores.
- Example 5: Potential out-of-bounds vector store.
+ Example: Potential out-of-bounds vector store.
```mlir
vector.store %valueToStore, %memref[%index] : memref<?xf32>, vector<8xf32>
```
- Example 6: Explicit out-of-bounds vector store.
+ Example: Explicit out-of-bounds vector store.
```mlir
vector.store %valueToStore, %memref[%c0] : memref<7xf32>, vector<8xf32>
```
}];
- let arguments = (ins AnyVector:$valueToStore,
+ let arguments = (ins
+ AnyVectorOfAnyRank:$valueToStore,
Arg<AnyMemRef, "the reference to store to",
[MemWrite]>:$base,
- Variadic<Index>:$indices);
+ Variadic<Index>:$indices
+ );
let extraClassDeclaration = [{
MemRefType getMemRefType() {
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index 1c5bf09c1aeb..fff3b87faff6 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -443,7 +443,7 @@ struct TypedValue : Value {
static bool classof(Value value) { return llvm::isa<Ty>(value.getType()); }
/// Return the known Type
- Ty getType() { return llvm::cast<Ty>(Value::getType()); }
+ Ty getType() const { return llvm::cast<Ty>(Value::getType()); }
void setType(Ty ty) { Value::setType(ty); }
};
diff --git a/mlir/include/mlir/Transforms/FoldUtils.h b/mlir/include/mlir/Transforms/FoldUtils.h
index 2600da361496..2e7a6fe3e362 100644
--- a/mlir/include/mlir/Transforms/FoldUtils.h
+++ b/mlir/include/mlir/Transforms/FoldUtils.h
@@ -33,7 +33,8 @@ class Value;
class OperationFolder {
public:
OperationFolder(MLIRContext *ctx, OpBuilder::Listener *listener = nullptr)
- : interfaces(ctx), rewriter(ctx, listener) {}
+ : erasedFoldedLocation(UnknownLoc::get(ctx)), interfaces(ctx),
+ rewriter(ctx, listener) {}
/// Tries to perform folding on the given `op`, including unifying
/// deduplicated constants. If successful, replaces `op`'s uses with
@@ -65,7 +66,7 @@ public:
/// be created in a parent block. On success this returns the constant
/// operation, nullptr otherwise.
Value getOrCreateConstant(Block *block, Dialect *dialect, Attribute value,
- Type type, Location loc);
+ Type type);
private:
/// This map keeps track of uniqued constants by dialect, attribute, and type.
@@ -95,6 +96,9 @@ private:
Dialect *dialect, Attribute value,
Type type, Location loc);
+ /// The location used to overwrite the locations of folder-owned constants.
+ UnknownLoc erasedFoldedLocation;
+
/// A mapping between an insertion region and the constants that have been
/// created within it.
DenseMap<Region *, ConstantMap> foldScopes;
diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
index 2820d27b65f7..7875fa9d43d9 100644
--- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
@@ -191,7 +191,7 @@ void LivenessAnalysis::visitCallOperand(OpOperand &operand) {
void LivenessAnalysis::setToExitState(Liveness *lattice) {
// This marks values of type (2) liveness as "live".
- lattice->markLive();
+ (void)lattice->markLive();
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Analysis/Presburger/CMakeLists.txt b/mlir/lib/Analysis/Presburger/CMakeLists.txt
index 22f1a4cac440..e77e1623dae1 100644
--- a/mlir/lib/Analysis/Presburger/CMakeLists.txt
+++ b/mlir/lib/Analysis/Presburger/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_library(MLIRPresburger
PresburgerRelation.cpp
PresburgerSpace.cpp
PWMAFunction.cpp
+ QuasiPolynomial.cpp
Simplex.cpp
SlowMPInt.cpp
Utils.cpp
diff --git a/mlir/lib/Analysis/Presburger/GeneratingFunction.h b/mlir/lib/Analysis/Presburger/GeneratingFunction.h
new file mode 100644
index 000000000000..f7deba921ea5
--- /dev/null
+++ b/mlir/lib/Analysis/Presburger/GeneratingFunction.h
@@ -0,0 +1,134 @@
+//===- GeneratingFunction.h - Generating Functions over Q^d -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of the GeneratingFunction class for Barvinok's algorithm,
+// which represents a function over Q^n, parameterized by d parameters.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H
+#define MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H
+
+#include "mlir/Analysis/Presburger/Fraction.h"
+#include "mlir/Analysis/Presburger/Matrix.h"
+
+namespace mlir {
+namespace presburger {
+
+// A parametric point is a vector, each of whose elements
+// is an affine function of the d parameters. Each row
+// of the matrix represents one such affine function and
+// has d+1 elements (the d parameter coefficients plus a constant term).
+using ParamPoint = FracMatrix;
+
+// A point is simply a vector.
+using Point = SmallVector<Fraction>;
+
+// A class to describe the type of generating function
+// used to enumerate the integer points in a polytope.
+// Consists of a set of terms, where the ith term has
+// * a sign, ±1, stored in `signs[i]`
+// * a numerator, of the form x^{n},
+// where n, stored in `numerators[i]`,
+// is a parametric point.
+// * a denominator, of the form (1 - x^{d1})...(1 - x^{dn}),
+// where each dj, stored in `denominators[i][j]`,
+// is a vector.
+//
+// Represents functions f_p : Q^n -> Q of the form
+//
+// f_p(x) = \sum_i s_i * x^{n_i(p)} / (\prod_j (1 - x^{d_{ij}}))
+//
+// where s_i is ±1,
+// n_i : Q^d -> Q^n is an n-vector of affine functions of the d parameters, and
+// d_{ij} \in Q^n are vectors.
+class GeneratingFunction {
+public:
+ GeneratingFunction(unsigned numParam, SmallVector<int> signs,
+ std::vector<ParamPoint> nums,
+ std::vector<std::vector<Point>> dens)
+ : numParam(numParam), signs(signs), numerators(nums), denominators(dens) {
+#ifndef NDEBUG
+ for (const ParamPoint &term : numerators)
+ assert(term.getNumColumns() == numParam + 1 &&
+ "dimensionality of numerator exponents does not match number of "
+ "parameters!");
+#endif // NDEBUG
+ }
+
+ unsigned getNumParams() { return numParam; }
+
+ SmallVector<int> getSigns() { return signs; }
+
+ std::vector<ParamPoint> getNumerators() { return numerators; }
+
+ std::vector<std::vector<Point>> getDenominators() { return denominators; }
+
+ GeneratingFunction operator+(GeneratingFunction &gf) const {
+ assert(numParam == gf.getNumParams() &&
+ "two generating functions with different numbers of parameters "
+ "cannot be added!");
+ SmallVector<int> sumSigns = signs;
+ sumSigns.append(gf.signs);
+
+ std::vector<ParamPoint> sumNumerators = numerators;
+ sumNumerators.insert(sumNumerators.end(), gf.numerators.begin(),
+ gf.numerators.end());
+
+ std::vector<std::vector<Point>> sumDenominators = denominators;
+ sumDenominators.insert(sumDenominators.end(), gf.denominators.begin(),
+ gf.denominators.end());
+ return GeneratingFunction(numParam, sumSigns, sumNumerators, sumDenominators);
+ }
+
+ llvm::raw_ostream &print(llvm::raw_ostream &os) const {
+ for (unsigned i = 0, e = signs.size(); i < e; i++) {
+ if (i == 0) {
+ if (signs[i] == -1)
+ os << "- ";
+ } else {
+ if (signs[i] == 1)
+ os << " + ";
+ else
+ os << " - ";
+ }
+
+ os << "x^[";
+ unsigned r = numerators[i].getNumRows();
+ for (unsigned j = 0; j < r - 1; j++) {
+ os << "[";
+ for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++)
+ os << numerators[i].at(j, k) << ",";
+ os << numerators[i].getRow(j).back() << "],";
+ }
+ os << "[";
+ for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++)
+ os << numerators[i].at(r - 1, k) << ",";
+ os << numerators[i].getRow(r - 1).back() << "]]/";
+
+ for (const Point &den : denominators[i]) {
+ os << "(x^[";
+ for (unsigned j = 0, e = den.size(); j < e - 1; j++)
+ os << den[j] << ",";
+ os << den.back() << "])";
+ }
+ }
+ return os;
+ }
+
+private:
+ unsigned numParam;
+ SmallVector<int> signs;
+ std::vector<ParamPoint> numerators;
+ std::vector<std::vector<Point>> denominators;
+};
+
+} // namespace presburger
+} // namespace mlir
+
+#endif // MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H
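
To make the term structure above concrete, here is a short construction sketch for a one-term, parameter-free generating function x^{[[0]]} / (1 - x^{[1]}), printed with the class's print method. It assumes the lib-local header is reachable on the include path, that FracMatrix(rows, cols) default-initializes its entries to zero, and the function name is made up for illustration.

```cpp
#include "GeneratingFunction.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir::presburger;

// Sketch: a single term with sign +1, a numerator exponent given by a 1x1
// parametric point (numParam = 0, so only the constant column), and one
// denominator factor (1 - x^{(1)}).
void printTrivialGeneratingFunction() {
  ParamPoint numerator(/*rows=*/1, /*cols=*/1); // assumed zero-initialized
  std::vector<Point> denominator = {Point({Fraction(1, 1)})};

  GeneratingFunction gf(/*numParam=*/0, /*signs=*/{1},
                        /*nums=*/{numerator}, /*dens=*/{denominator});
  gf.print(llvm::errs()) << "\n";
}
```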
diff --git a/mlir/lib/Analysis/Presburger/Matrix.cpp b/mlir/lib/Analysis/Presburger/Matrix.cpp
index 25300f84cfc0..b68a7b7004bb 100644
--- a/mlir/lib/Analysis/Presburger/Matrix.cpp
+++ b/mlir/lib/Analysis/Presburger/Matrix.cpp
@@ -452,6 +452,9 @@ MPInt IntMatrix::determinant(IntMatrix *inverse) const {
if (detM == 0)
return MPInt(0);
+ if (!inverse)
+ return detM;
+
*inverse = IntMatrix(nRows, nColumns);
for (unsigned i = 0; i < nRows; i++)
for (unsigned j = 0; j < nColumns; j++)
@@ -642,5 +645,4 @@ void FracMatrix::LLL(Fraction delta) {
k = k > 1 ? k - 1 : 1;
}
}
- return;
}
diff --git a/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp b/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp
index cf1b3befbc89..a6f9af83b977 100644
--- a/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp
+++ b/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp
@@ -18,8 +18,9 @@ using namespace presburger;
bool Identifier::isEqual(const Identifier &other) const {
if (value == nullptr || other.value == nullptr)
return false;
- assert(value == other.value && idType == other.idType &&
- "Values of Identifiers are equal but their types do not match.");
+ assert(value != other.value ||
+ (value == other.value && idType == other.idType &&
+ "Values of Identifiers are equal but their types do not match."));
return value == other.value;
}
@@ -293,6 +294,40 @@ void PresburgerSpace::setVarSymbolSeperation(unsigned newSymbolCount) {
// `identifiers` remains same.
}
+void PresburgerSpace::mergeAndAlignSymbols(PresburgerSpace &other) {
+ assert(usingIds && other.usingIds &&
+ "Both spaces need to have identifers to merge & align");
+
+ // First merge & align identifiers into `other` from `this`.
+ unsigned kindBeginOffset = other.getVarKindOffset(VarKind::Symbol);
+ unsigned i = 0;
+ for (const Identifier *identifier =
+ identifiers.begin() + getVarKindOffset(VarKind::Symbol);
+ identifier != identifiers.begin() + getVarKindEnd(VarKind::Symbol);
+ identifier++) {
+ // If the identifier exists in `other`, then align it; otherwise insert it
+ // assuming it is a new identifier. Search in `other` starting at position
+ // `i` since the left of `i` is aligned.
+ auto *findEnd =
+ other.identifiers.begin() + other.getVarKindEnd(VarKind::Symbol);
+ auto *itr = std::find(other.identifiers.begin() + kindBeginOffset + i,
+ findEnd, *identifier);
+ if (itr != findEnd) {
+ std::iter_swap(other.identifiers.begin() + kindBeginOffset + i, itr);
+ } else {
+ other.insertVar(VarKind::Symbol, i);
+ other.getId(VarKind::Symbol, i) = *identifier;
+ }
+ i++;
+ }
+
+ // Finally add identifiers that are in `other`, but not in `this` to `this`.
+ for (unsigned e = other.getNumVarKind(VarKind::Symbol); i < e; i++) {
+ insertVar(VarKind::Symbol, i);
+ getId(VarKind::Symbol, i) = other.getId(VarKind::Symbol, i);
+ }
+}
+
void PresburgerSpace::print(llvm::raw_ostream &os) const {
os << "Domain: " << getNumDomainVars() << ", "
<< "Range: " << getNumRangeVars() << ", "
diff --git a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp
new file mode 100644
index 000000000000..feed683a203c
--- /dev/null
+++ b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp
@@ -0,0 +1,115 @@
+//===- QuasiPolynomial.cpp - Quasipolynomial Class --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/Presburger/QuasiPolynomial.h"
+#include "mlir/Analysis/Presburger/Fraction.h"
+#include "mlir/Analysis/Presburger/PresburgerSpace.h"
+#include "mlir/Analysis/Presburger/Utils.h"
+
+using namespace mlir;
+using namespace presburger;
+
+QuasiPolynomial::QuasiPolynomial(
+ unsigned numVars, SmallVector<Fraction> coeffs,
+ std::vector<std::vector<SmallVector<Fraction>>> aff)
+ : PresburgerSpace(/*numDomain=*/numVars, /*numRange=*/1, /*numSymbols=*/0,
+ /*numLocals=*/0),
+ coefficients(coeffs), affine(aff) {
+#ifndef NDEBUG
+ // For each term which involves at least one affine function,
+ for (const std::vector<SmallVector<Fraction>> &term : affine) {
+ if (term.empty())
+ continue;
+ // the number of elements in each affine function is
+ // one more than the number of symbols.
+ for (const SmallVector<Fraction> &aff : term) {
+ assert(aff.size() == getNumInputs() + 1 &&
+ "dimensionality of affine functions does not match number of "
+ "symbols!");
+ }
+ }
+#endif // NDEBUG
+}
+
+QuasiPolynomial QuasiPolynomial::operator+(const QuasiPolynomial &x) const {
+ assert(getNumInputs() == x.getNumInputs() &&
+ "two quasi-polynomials with different numbers of symbols cannot "
+ "be added!");
+ SmallVector<Fraction> sumCoeffs = coefficients;
+ sumCoeffs.append(x.coefficients);
+ std::vector<std::vector<SmallVector<Fraction>>> sumAff = affine;
+ sumAff.insert(sumAff.end(), x.affine.begin(), x.affine.end());
+ return QuasiPolynomial(getNumInputs(), sumCoeffs, sumAff);
+}
+
+QuasiPolynomial QuasiPolynomial::operator-(const QuasiPolynomial &x) const {
+ assert(getNumInputs() == x.getNumInputs() &&
+ "two quasi-polynomials with different numbers of symbols cannot "
+ "be subtracted!");
+ QuasiPolynomial qp(getNumInputs(), x.coefficients, x.affine);
+ for (Fraction &coeff : qp.coefficients)
+ coeff = -coeff;
+ return *this + qp;
+}
+
+QuasiPolynomial QuasiPolynomial::operator*(const QuasiPolynomial &x) const {
+ assert(getNumInputs() == x.getNumInputs() &&
+ "two quasi-polynomials with different numbers of "
+ "symbols cannot be multiplied!");
+
+ SmallVector<Fraction> coeffs;
+ coeffs.reserve(coefficients.size() * x.coefficients.size());
+ for (const Fraction &coeff : coefficients)
+ for (const Fraction &xcoeff : x.coefficients)
+ coeffs.push_back(coeff * xcoeff);
+
+ std::vector<SmallVector<Fraction>> product;
+ std::vector<std::vector<SmallVector<Fraction>>> aff;
+ aff.reserve(affine.size() * x.affine.size());
+ for (const std::vector<SmallVector<Fraction>> &term : affine) {
+ for (const std::vector<SmallVector<Fraction>> &xterm : x.affine) {
+ product.clear();
+ product.insert(product.end(), term.begin(), term.end());
+ product.insert(product.end(), xterm.begin(), xterm.end());
+ aff.push_back(product);
+ }
+ }
+
+ return QuasiPolynomial(getNumInputs(), coeffs, aff);
+}
+
+QuasiPolynomial QuasiPolynomial::operator/(const Fraction x) const {
+ assert(x != 0 && "division by zero!");
+ QuasiPolynomial qp(*this);
+ for (Fraction &coeff : qp.coefficients)
+ coeff /= x;
+ return qp;
+}
+
+// Removes terms which evaluate to zero from the expression.
+QuasiPolynomial QuasiPolynomial::simplify() {
+ SmallVector<Fraction> newCoeffs({});
+ std::vector<std::vector<SmallVector<Fraction>>> newAffine({});
+ for (unsigned i = 0, e = coefficients.size(); i < e; i++) {
+ // A term is zero if its coefficient is zero, or
+ if (coefficients[i] == Fraction(0, 1))
+ continue;
+ bool product_is_zero =
+ // if any of the affine functions in the product
+ llvm::any_of(affine[i], [](const SmallVector<Fraction> &affine_ij) {
+ // has all its coefficients as zero.
+ return llvm::all_of(affine_ij,
+ [](const Fraction &f) { return f == 0; });
+ });
+ if (product_is_zero)
+ continue;
+ newCoeffs.push_back(coefficients[i]);
+ newAffine.push_back(affine[i]);
+ }
+ return QuasiPolynomial(getNumInputs(), newCoeffs, newAffine);
+}
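
A small sketch of the arithmetic and simplification defined above, using only the structural API visible here: each term is a coefficient paired with a list of affine-function factors, every factor having numVars + 1 entries. The function name is illustrative, and no claim is made beyond what the constructor and simplify() enforce.

```cpp
#include "mlir/Analysis/Presburger/QuasiPolynomial.h"

using namespace mlir::presburger;

// Illustrative only: over one variable x0, build three terms and simplify.
QuasiPolynomial buildExample() {
  // Coefficient 3 with a single affine factor [1, 1] (i.e. 1*x0 + 1).
  QuasiPolynomial p(/*numVars=*/1, /*coeffs=*/{Fraction(3, 1)},
                    /*aff=*/{{{Fraction(1, 1), Fraction(1, 1)}}});
  // Coefficient 2 with an empty product of affine factors.
  QuasiPolynomial q(/*numVars=*/1, /*coeffs=*/{Fraction(2, 1)},
                    /*aff=*/{{}});
  // Zero coefficient: simplify() drops this term.
  QuasiPolynomial z(/*numVars=*/1, /*coeffs=*/{Fraction(0, 1)},
                    /*aff=*/{{{Fraction(1, 1), Fraction(0, 1)}}});
  // The sum keeps only the two non-zero terms after simplification.
  return (p + q + z).simplify();
}
```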
diff --git a/mlir/lib/CAPI/Dialect/CMakeLists.txt b/mlir/lib/CAPI/Dialect/CMakeLists.txt
index d815eba48d9b..b2952da17a41 100644
--- a/mlir/lib/CAPI/Dialect/CMakeLists.txt
+++ b/mlir/lib/CAPI/Dialect/CMakeLists.txt
@@ -171,6 +171,15 @@ add_mlir_upstream_c_api_library(MLIRCAPIFunc
MLIRFuncDialect
)
+add_mlir_upstream_c_api_library(MLIRCAPISPIRV
+ SPIRV.cpp
+
+ PARTIAL_SOURCES_INTENDED
+ LINK_LIBS PUBLIC
+ MLIRCAPIIR
+ MLIRSPIRVDialect
+)
+
add_mlir_upstream_c_api_library(MLIRCAPITensor
Tensor.cpp
diff --git a/mlir/lib/CAPI/Dialect/SPIRV.cpp b/mlir/lib/CAPI/Dialect/SPIRV.cpp
new file mode 100644
index 000000000000..9bfe26b9598e
--- /dev/null
+++ b/mlir/lib/CAPI/Dialect/SPIRV.cpp
@@ -0,0 +1,13 @@
+//===- SPIRV.cpp - C Interface for SPIRV dialect --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir-c/Dialect/SPIRV.h"
+#include "mlir/CAPI/Registration.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+
+MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(SPIRV, spirv, mlir::spirv::SPIRVDialect)
diff --git a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp
index f9d6f04a811f..0c6e2e80b88a 100644
--- a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp
+++ b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp
@@ -32,6 +32,95 @@ using namespace mlir;
namespace {
+/// Helper to create an 'arm_sme.intr.ld1*.(horiz|vert)' intrinsic.
+static Operation *createLoadTileSliceIntrinsic(
+ RewriterBase &rewriter, Location loc, arm_sme::ArmSMETileType type,
+ arm_sme::TileSliceLayout layout, Value maskOp, Value ptr,
+ IntegerAttr tileId, Value tileSliceI32) {
+ if (layout == arm_sme::TileSliceLayout::Horizontal) {
+ switch (type) {
+ case arm_sme::ArmSMETileType::ZAB:
+ return rewriter.create<arm_sme::aarch64_sme_ld1b_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAH:
+ return rewriter.create<arm_sme::aarch64_sme_ld1h_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAS:
+ return rewriter.create<arm_sme::aarch64_sme_ld1w_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAD:
+ return rewriter.create<arm_sme::aarch64_sme_ld1d_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAQ:
+ return rewriter.create<arm_sme::aarch64_sme_ld1q_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ }
+ } else {
+ switch (type) {
+ case arm_sme::ArmSMETileType::ZAB:
+ return rewriter.create<arm_sme::aarch64_sme_ld1b_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAH:
+ return rewriter.create<arm_sme::aarch64_sme_ld1h_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAS:
+ return rewriter.create<arm_sme::aarch64_sme_ld1w_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAD:
+ return rewriter.create<arm_sme::aarch64_sme_ld1d_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAQ:
+ return rewriter.create<arm_sme::aarch64_sme_ld1q_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ }
+ }
+}
+
+/// Helper to create an 'arm_sme.intr.st1*.(horiz|vert)' intrinsic.
+static Operation *createStoreTileSliceIntrinsic(
+ RewriterBase &rewriter, Location loc, arm_sme::ArmSMETileType type,
+ arm_sme::TileSliceLayout layout, Value maskOp, Value ptr,
+ IntegerAttr tileId, Value tileSliceI32) {
+ if (layout == arm_sme::TileSliceLayout::Horizontal) {
+ switch (type) {
+ case arm_sme::ArmSMETileType::ZAB:
+ return rewriter.create<arm_sme::aarch64_sme_st1b_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAH:
+ return rewriter.create<arm_sme::aarch64_sme_st1h_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAS:
+ return rewriter.create<arm_sme::aarch64_sme_st1w_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAD:
+ return rewriter.create<arm_sme::aarch64_sme_st1d_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAQ:
+ return rewriter.create<arm_sme::aarch64_sme_st1q_horiz>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ }
+ } else {
+ switch (type) {
+ case arm_sme::ArmSMETileType::ZAB:
+ return rewriter.create<arm_sme::aarch64_sme_st1b_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAH:
+ return rewriter.create<arm_sme::aarch64_sme_st1h_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAS:
+ return rewriter.create<arm_sme::aarch64_sme_st1w_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAD:
+ return rewriter.create<arm_sme::aarch64_sme_st1d_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ case arm_sme::ArmSMETileType::ZAQ:
+ return rewriter.create<arm_sme::aarch64_sme_st1q_vert>(
+ loc, maskOp, ptr, tileId, tileSliceI32);
+ }
+ }
+}
+
IntegerAttr getTileIdOrError(arm_sme::ArmSMETileOpInterface op) {
auto tileId = op.getTileId();
if (!tileId)
@@ -75,9 +164,6 @@ struct ZeroOpConversion : public ConvertOpToLLVMPattern<arm_sme::ZeroOp> {
ConversionPatternRewriter &rewriter) const override {
auto loc = zero.getLoc();
- unsigned tileElementWidth =
- zero.getVectorType().getElementType().getIntOrFloatBitWidth();
-
auto tileId = getTileIdOrError(zero);
if (!tileId)
return failure();
@@ -86,23 +172,24 @@ struct ZeroOpConversion : public ConvertOpToLLVMPattern<arm_sme::ZeroOp> {
// The base mask is just the mask to zero the first tile (of a size).
// These masks are derived from:
// https://developer.arm.com/documentation/ddi0602/2022-06/SME-Instructions/ZERO--Zero-a-list-of-64-bit-element-ZA-tiles-
+ arm_sme::ArmSMETileType tileType = *zero.getAllocatedTileType();
auto baseMaskForSize = [&] {
- switch (tileElementWidth) {
- case 8:
+ switch (tileType) {
+ case arm_sme::ArmSMETileType::ZAB:
// Zeroing the 8-bit ZA0.B tile is equivalent to zeroing all eight
// 64-bit element tiles named ZA0.D to ZA7.D.
return 0b1111'1111;
- case 16:
- // Zeroing the 16-bit ZA0.H tile is equivalent to zeroing 64-bit element
- // tiles named ZA0.D, ZA2.D, ZA4.D, and ZA6.D.
- // Shift this left once for ZA1.H.
+ case arm_sme::ArmSMETileType::ZAH:
+ // Zeroing the 16-bit ZA0.H tile is equivalent to zeroing 64-bit
+ // element tiles named ZA0.D, ZA2.D, ZA4.D, and ZA6.D. Shift this left
+ // once for ZA1.H.
return 0b0101'0101;
- case 32:
+ case arm_sme::ArmSMETileType::ZAS:
// Zeroing the 32-bit ZA0.S tile is equivalent to zeroing 64-bit
// element tiles named ZA0.D and ZA4.D.
// Shift left by 1, 2, or 3 respectively for ZA1.S, ZA2.S, ZA3.S.
return 0b0001'0001;
- case 64:
+ case arm_sme::ArmSMETileType::ZAD:
+ // Zeroing one of the 64-bit tiles ZA0.D to ZA7.D just requires
// setting the bit for that tile.
return 0b0000'0001;
@@ -172,63 +259,13 @@ struct LoadTileSliceConversion
// Create all active predicate mask.
auto maskOp = loadTileSliceOp.getMask();
- auto tileType = loadTileSliceOp.getVectorType();
- auto tileElementType = tileType.getElementType();
- unsigned tileElementWidth = tileElementType.getIntOrFloatBitWidth();
+ auto tileVectorType = loadTileSliceOp.getVectorType();
+ arm_sme::ArmSMETileType tileType = *arm_sme::getSMETileType(tileVectorType);
arm_sme::TileSliceLayout layout = loadTileSliceOp.getLayout();
// Create 'arm_sme.intr.ld1*.(horiz|vert)' intrinsic to load ZA tile slice.
- if (layout == arm_sme::TileSliceLayout::Horizontal) {
- switch (tileElementWidth) {
- default:
- llvm_unreachable("unexpected element type!");
- case 8:
- rewriter.create<arm_sme::aarch64_sme_ld1b_horiz>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 16:
- rewriter.create<arm_sme::aarch64_sme_ld1h_horiz>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 32:
- rewriter.create<arm_sme::aarch64_sme_ld1w_horiz>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 64:
- rewriter.create<arm_sme::aarch64_sme_ld1d_horiz>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 128:
- rewriter.create<arm_sme::aarch64_sme_ld1q_horiz>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- }
- } else {
- switch (tileElementWidth) {
- default:
- llvm_unreachable("unexpected element type!");
- case 8:
- rewriter.create<arm_sme::aarch64_sme_ld1b_vert>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 16:
- rewriter.create<arm_sme::aarch64_sme_ld1h_vert>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 32:
- rewriter.create<arm_sme::aarch64_sme_ld1w_vert>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 64:
- rewriter.create<arm_sme::aarch64_sme_ld1d_vert>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- case 128:
- rewriter.create<arm_sme::aarch64_sme_ld1q_vert>(loc, maskOp, ptr,
- tileId, tileSliceI32);
- break;
- }
- }
+ createLoadTileSliceIntrinsic(rewriter, loc, tileType, layout, maskOp, ptr,
+ tileId, tileSliceI32);
// The load intrinsics have no result, replace 'arm_sme.tile_load' with
// the input tile to preserve dataflow.
@@ -249,9 +286,7 @@ struct StoreTileSliceConversion
arm_sme::StoreTileSliceOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
auto loc = storeTileSliceOp.getLoc();
- auto tileType = storeTileSliceOp.getVectorType();
- auto tileElementType = tileType.getElementType();
- unsigned tileElementWidth = tileElementType.getIntOrFloatBitWidth();
+ auto tileVectorType = storeTileSliceOp.getVectorType();
auto tileId = getTileIdOrError(storeTileSliceOp);
if (!tileId)
@@ -271,58 +306,12 @@ struct StoreTileSliceConversion
auto maskOp = storeTileSliceOp.getMask();
arm_sme::TileSliceLayout layout = storeTileSliceOp.getLayout();
+ arm_sme::ArmSMETileType tileType = *arm_sme::getSMETileType(tileVectorType);
- if (layout == arm_sme::TileSliceLayout::Horizontal) {
- switch (tileElementWidth) {
- default:
- llvm_unreachable("unexpected element type!");
- case 8:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1b_horiz>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 16:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1h_horiz>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 32:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1w_horiz>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 64:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1d_horiz>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 128:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1q_horiz>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- }
- } else {
- switch (tileElementWidth) {
- default:
- llvm_unreachable("unexpected element type!");
- case 8:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1b_vert>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 16:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1h_vert>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 32:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1w_vert>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 64:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1d_vert>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- case 128:
- rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1q_vert>(
- storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32);
- break;
- }
- }
+ rewriter.replaceOp(storeTileSliceOp,
+ createStoreTileSliceIntrinsic(rewriter, loc, tileType,
+ layout, maskOp, ptr,
+ tileId, tileSliceI32));
return success();
}
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
index d383c16949f0..d7885e035959 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
@@ -16,10 +16,12 @@
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Matchers.h"
+#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/DialectConversion.h"
#include <optional>
@@ -591,10 +593,12 @@ public:
LogicalResult
matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto opType = op.getOp();
- auto result =
- createGroupReduceOp(rewriter, op.getLoc(), adaptor.getValue(), opType,
- /*isGroup*/ false, op.getUniform());
+ if (!isa<spirv::ScalarType>(adaptor.getValue().getType()))
+ return rewriter.notifyMatchFailure(op, "reduction type is not a scalar");
+
+ auto result = createGroupReduceOp(rewriter, op.getLoc(), adaptor.getValue(),
+ adaptor.getOp(),
+ /*isGroup=*/false, adaptor.getUniform());
if (!result)
return failure();
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index ae89774239b5..8279b3408a6e 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -17,6 +17,7 @@
#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRV.h"
#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h"
#include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h"
+#include "mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
@@ -126,6 +127,8 @@ void GPUToSPIRVPass::runOnOperation() {
// TODO: Change SPIR-V conversion to be progressive and remove the following
// patterns.
+ ScfToSPIRVContext scfContext;
+ populateSCFToSPIRVPatterns(typeConverter, scfContext, patterns);
mlir::arith::populateArithToSPIRVPatterns(typeConverter, patterns);
populateMemRefToSPIRVPatterns(typeConverter, patterns);
populateFuncToSPIRVPatterns(typeConverter, patterns);
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index cd1cfb3b7686..730858ffc67a 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -239,11 +239,11 @@ void mlir::configureOpenMPToLLVMConversionLegality(
target.addDynamicallyLegalOp<
mlir::omp::AtomicReadOp, mlir::omp::AtomicWriteOp, mlir::omp::FlushOp,
mlir::omp::ThreadprivateOp, mlir::omp::YieldOp, mlir::omp::EnterDataOp,
- mlir::omp::ExitDataOp, mlir::omp::DataBoundsOp, mlir::omp::MapInfoOp>(
- [&](Operation *op) {
- return typeConverter.isLegal(op->getOperandTypes()) &&
- typeConverter.isLegal(op->getResultTypes());
- });
+ mlir::omp::ExitDataOp, mlir::omp::UpdateDataOp, mlir::omp::DataBoundsOp,
+ mlir::omp::MapInfoOp>([&](Operation *op) {
+ return typeConverter.isLegal(op->getOperandTypes()) &&
+ typeConverter.isLegal(op->getResultTypes());
+ });
target.addDynamicallyLegalOp<mlir::omp::ReductionOp>([&](Operation *op) {
return typeConverter.isLegal(op->getOperandTypes());
});
@@ -282,6 +282,7 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
RegionLessOpConversion<omp::YieldOp>,
RegionLessOpConversion<omp::EnterDataOp>,
RegionLessOpConversion<omp::ExitDataOp>,
+ RegionLessOpConversion<omp::UpdateDataOp>,
RegionLessOpWithVarOperandsConversion<omp::DataBoundsOp>>(converter);
}
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index beed71d93d9b..7c35389f1e92 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -1052,55 +1052,6 @@ public:
}
};
-class TransposeConverter : public OpRewritePattern<tosa::TransposeOp> {
-public:
- using OpRewritePattern<tosa::TransposeOp>::OpRewritePattern;
-
- LogicalResult matchAndRewrite(tosa::TransposeOp op,
- PatternRewriter &rewriter) const final {
- DenseIntElementsAttr perms;
- if (!matchPattern(op.getPerms(), m_Constant(&perms))) {
- return rewriter.notifyMatchFailure(op, "unmatched permutation tensor");
- }
-
- auto loc = op.getLoc();
- auto input = op->getOperand(0);
- auto resultTy = cast<ShapedType>(op.getType());
-
- SmallVector<Value> dynDims;
- dynDims.resize(cast<ShapedType>(op->getResult(0).getType()).getRank());
-
- SmallVector<AffineExpr, 2> inputExprs;
- inputExprs.resize(resultTy.getRank());
- for (const auto &permutation : llvm::enumerate(perms.getValues<APInt>())) {
- auto index = permutation.index();
- auto value = permutation.value().getZExtValue();
- if (!resultTy.hasRank() || resultTy.isDynamicDim(index)) {
- dynDims[index] = rewriter.create<tensor::DimOp>(loc, input, value);
- }
- inputExprs[value] = rewriter.getAffineDimExpr(index);
- }
-
- SmallVector<Value> filteredDims = condenseValues(dynDims);
-
- auto emptyTensor = rewriter.create<tensor::EmptyOp>(
- loc, resultTy.getShape(), resultTy.getElementType(), filteredDims);
-
- SmallVector<AffineMap, 2> affineMaps = {
- AffineMap::get(resultTy.getRank(), /*symbolCount=*/0, inputExprs,
- rewriter.getContext()),
- rewriter.getMultiDimIdentityMap(resultTy.getRank())};
-
- rewriter.replaceOpWithNewOp<linalg::GenericOp>(
- op, resultTy, op.getInput1(), ValueRange{emptyTensor}, affineMaps,
- getNParallelLoopsAttrs(resultTy.getRank()),
- [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) {
- nestedBuilder.create<linalg::YieldOp>(loc, *args.begin());
- });
- return success();
- }
-};
-
class RescaleConverter : public OpRewritePattern<tosa::RescaleOp> {
public:
using OpRewritePattern<tosa::RescaleOp>::OpRewritePattern;
@@ -2454,7 +2405,6 @@ void mlir::tosa::populateTosaToLinalgConversionPatterns(
ReverseConverter,
RFFT2dConverter,
TableConverter,
- TileConverter,
- TransposeConverter>(patterns->getContext());
+ TileConverter>(patterns->getContext());
// clang-format on
}
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
index b3fbc7dd0b22..8dc2d27bd545 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
@@ -19,6 +19,7 @@
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/Dialect/Tosa/Utils/ConversionUtils.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
@@ -984,6 +985,31 @@ public:
}
};
+class TransposeConverter : public OpRewritePattern<tosa::TransposeOp> {
+public:
+ using OpRewritePattern<tosa::TransposeOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(tosa::TransposeOp op,
+ PatternRewriter &rewriter) const final {
+ SmallVector<int64_t> constantPerms;
+ if (failed(op.getConstantPerms(constantPerms)))
+ return failure();
+
+ Location loc = op.getLoc();
+ // The verifier should have made sure we have a valid permutation tensor.
+ assert(isPermutationVector(constantPerms) && "Expected valid permutation");
+ SmallVector<OpFoldResult> inputSizes =
+ tensor::getMixedSizes(rewriter, loc, op.getInput1());
+ auto permutedSizes =
+ applyPermutation<OpFoldResult>(inputSizes, constantPerms);
+
+ auto permutedInit = rewriter.create<tensor::EmptyOp>(
+ loc, permutedSizes, op.getInput1().getType().getElementType());
+ rewriter.replaceOpWithNewOp<linalg::TransposeOp>(
+ op, op.getInput1(), permutedInit, constantPerms);
+ return success();
+ }
+};
} // namespace
void mlir::tosa::populateTosaToLinalgNamedConversionPatterns(
@@ -1004,6 +1030,8 @@ void mlir::tosa::populateTosaToLinalgNamedConversionPatterns(
MatMulConverter,
MaxPool2dConverter,
AvgPool2dConverter,
- FullyConnectedConverter>(patterns->getContext());
+ FullyConnectedConverter,
+ TransposeConverter
+ >(patterns->getContext());
// clang-format on
}
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp
index 5312dc164c26..096969391e51 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamedPass.cpp
@@ -60,6 +60,7 @@ public:
target.addIllegalOp<tosa::AvgPool2dOp>();
target.addIllegalOp<tosa::MatMulOp>();
target.addIllegalOp<tosa::FullyConnectedOp>();
+ target.addIllegalOp<tosa::TransposeOp>();
target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index 66d921b4889f..bb319208f58a 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -205,7 +205,10 @@ static bool isEscapingMemref(Value memref, Block *block) {
// (e.g., call ops, alias creating ops, etc.).
return llvm::any_of(memref.getUsers(), [&](Operation *user) {
// Ignore users outside of `block`.
- if (block->getParent()->findAncestorOpInRegion(*user)->getBlock() != block)
+ Operation *ancestorOp = block->getParent()->findAncestorOpInRegion(*user);
+ if (!ancestorOp)
+ return true;
+ if (ancestorOp->getBlock() != block)
return false;
return !isa<AffineMapAccessInterface>(*user);
});
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
index 42a63316b31c..8deb8f028ba4 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -186,6 +186,32 @@ public:
}
};
+template <typename OpTy, arith::CmpFPredicate pred>
+struct MaxNumMinNumFOpConverter : public OpRewritePattern<OpTy> {
+public:
+ using OpRewritePattern<OpTy>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(OpTy op,
+ PatternRewriter &rewriter) const final {
+ Value lhs = op.getLhs();
+ Value rhs = op.getRhs();
+
+ Location loc = op.getLoc();
+ // If any operand is NaN, 'cmp' will be true (and 'select' returns 'lhs').
+ static_assert(pred == arith::CmpFPredicate::UGT ||
+ pred == arith::CmpFPredicate::ULT,
+ "pred must be either UGT or ULT");
+ Value cmp = rewriter.create<arith::CmpFOp>(loc, pred, lhs, rhs);
+ Value select = rewriter.create<arith::SelectOp>(loc, cmp, lhs, rhs);
+
+ // Handle the case where lhs is NaN: 'isNaN(lhs) ? rhs : select'.
+ Value isNaN = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
+ lhs, lhs);
+ rewriter.replaceOpWithNewOp<arith::SelectOp>(op, isNaN, rhs, select);
+ return success();
+ }
+};
+
struct BFloat16ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
using OpRewritePattern::OpRewritePattern;
LogicalResult matchAndRewrite(arith::ExtFOp op,
@@ -319,7 +345,9 @@ struct ArithExpandOpsPass
arith::CeilDivUIOp,
arith::FloorDivSIOp,
arith::MaximumFOp,
- arith::MinimumFOp
+ arith::MinimumFOp,
+ arith::MaxNumFOp,
+ arith::MinNumFOp
>();
if (includeBf16) {
@@ -365,7 +393,9 @@ void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
// clang-format off
patterns.add<
MaximumMinimumFOpConverter<MaximumFOp, arith::CmpFPredicate::UGT>,
- MaximumMinimumFOpConverter<MinimumFOp, arith::CmpFPredicate::ULT>
+ MaximumMinimumFOpConverter<MinimumFOp, arith::CmpFPredicate::ULT>,
+ MaxNumMinNumFOpConverter<MaxNumFOp, arith::CmpFPredicate::UGT>,
+ MaxNumMinNumFOpConverter<MinNumFOp, arith::CmpFPredicate::ULT>
>(patterns.getContext());
// clang-format on
}
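
The predicate choice in MaxNumMinNumFOpConverter is what gives maxnumf/minnumf their "prefer the non-NaN operand" behavior. The scalar C++ model below is purely illustrative (it is not part of the pass, and the function name is made up); it mirrors the emitted compare/select chain for the UGT case.

```cpp
// Scalar model of the MaxNumFOp expansion above (UGT predicate).
// UGT(lhs, rhs) is "unordered or greater than": true when either operand is
// NaN or when lhs > rhs, which !(lhs <= rhs) reproduces for IEEE floats.
float maxnumfModel(float lhs, float rhs) {
  bool cmp = !(lhs <= rhs);      // arith.cmpf ugt
  float sel = cmp ? lhs : rhs;   // arith.select
  bool lhsIsNaN = lhs != lhs;    // arith.cmpf uno, lhs, lhs
  return lhsIsNaN ? rhs : sel;   // if lhs is NaN, fall back to rhs
}
// minnumf is identical with ULT, i.e. !(lhs >= rhs) for the first compare.
```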
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
index 597846e31e21..8aa51f352f82 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
@@ -223,8 +223,10 @@ struct AssignTileIDsPattern
if (failed(tileId))
return tileOp.emitError("ran out of SME virtual tiles!");
- func->setDiscardableAttr(kTilesInUseAttr,
- rewriter.getI32IntegerAttr((unsigned)tilesInUse));
+ rewriter.updateRootInPlace(func, [&]() {
+ func->setDiscardableAttr(
+ kTilesInUseAttr, rewriter.getI32IntegerAttr((unsigned)tilesInUse));
+ });
// Find all the ops that (transitively) depend on this tile.
SetVector<Operation *> dependantOps;
@@ -245,14 +247,15 @@ struct AssignTileIDsPattern
// scf.if, and moving the contents of %tileA or %tileB to result tile (based
// on the %some_cond).
auto tileIDAttr = rewriter.getI32IntegerAttr(*tileId);
- tileOp.setTileId(tileIDAttr);
+ rewriter.updateRootInPlace(tileOp, [&]() { tileOp.setTileId(tileIDAttr); });
for (auto *op : dependantOps) {
if (auto tileOp = llvm::dyn_cast<ArmSMETileOpInterface>(op)) {
auto currentTileId = tileOp.getTileId();
if (currentTileId && unsigned(currentTileId.getInt()) != tileId)
return tileOp.emitOpError(
"already assigned different SME virtual tile!");
- tileOp.setTileId(tileIDAttr);
+ rewriter.updateRootInPlace(tileOp,
+ [&]() { tileOp.setTileId(tileIDAttr); });
}
}
diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index fd32efe783bc..8508d29d2306 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -50,6 +50,32 @@ void mlir::emitc::buildTerminatedBody(OpBuilder &builder, Location loc) {
builder.create<emitc::YieldOp>(loc);
}
+/// Check that the type of the initial value is compatible with the operation's
+/// result type.
+static LogicalResult verifyInitializationAttribute(Operation *op,
+ Attribute value) {
+ assert(op->getNumResults() == 1 && "operation must have 1 result");
+
+ if (llvm::isa<emitc::OpaqueAttr>(value))
+ return success();
+
+ if (llvm::isa<StringAttr>(value))
+ return op->emitOpError()
+ << "string attributes are not supported, use #emitc.opaque instead";
+
+ Type resultType = op->getResult(0).getType();
+ Type attrType = cast<TypedAttr>(value).getType();
+
+ if (resultType != attrType)
+ return op->emitOpError()
+ << "requires attribute to either be an #emitc.opaque attribute or "
+ "it's type ("
+ << attrType << ") to match the op's result type (" << resultType
+ << ")";
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// AddOp
//===----------------------------------------------------------------------===//
@@ -169,21 +195,14 @@ LogicalResult emitc::CallOpaqueOp::verify() {
// ConstantOp
//===----------------------------------------------------------------------===//
-/// The constant op requires that the attribute's type matches the return type.
LogicalResult emitc::ConstantOp::verify() {
- if (llvm::isa<emitc::OpaqueAttr>(getValueAttr()))
- return success();
-
- // Value must not be empty
- StringAttr strAttr = llvm::dyn_cast<StringAttr>(getValueAttr());
- if (strAttr && strAttr.empty())
- return emitOpError() << "value must not be empty";
-
- auto value = cast<TypedAttr>(getValueAttr());
- Type type = getType();
- if (!llvm::isa<NoneType>(value.getType()) && type != value.getType())
- return emitOpError() << "requires attribute's type (" << value.getType()
- << ") to match op's return type (" << type << ")";
+ Attribute value = getValueAttr();
+ if (failed(verifyInitializationAttribute(getOperation(), value)))
+ return failure();
+ if (auto opaqueValue = llvm::dyn_cast<emitc::OpaqueAttr>(value)) {
+ if (opaqueValue.getValue().empty())
+ return emitOpError() << "value must not be empty";
+ }
return success();
}
@@ -561,17 +580,8 @@ LogicalResult SubOp::verify() {
// VariableOp
//===----------------------------------------------------------------------===//
-/// The variable op requires that the attribute's type matches the return type.
LogicalResult emitc::VariableOp::verify() {
- if (llvm::isa<emitc::OpaqueAttr>(getValueAttr()))
- return success();
-
- auto value = cast<TypedAttr>(getValueAttr());
- Type type = getType();
- if (!llvm::isa<NoneType>(value.getType()) && type != value.getType())
- return emitOpError() << "requires attribute's type (" << value.getType()
- << ") to match op's return type (" << type << ")";
- return success();
+ return verifyInitializationAttribute(getOperation(), getValueAttr());
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp
index 593d774cac73..88b691b50f32 100644
--- a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp
@@ -96,10 +96,7 @@ struct FoldExpressionOp : public OpRewritePattern<ExpressionOp> {
assert(clonedExpressionRootOp->getNumResults() == 1 &&
"Expected cloned root to have a single result");
- Value clonedExpressionResult = clonedExpressionRootOp->getResult(0);
-
- usedExpression.getResult().replaceAllUsesWith(clonedExpressionResult);
- rewriter.eraseOp(usedExpression);
+ rewriter.replaceOp(usedExpression, clonedExpressionRootOp);
anythingFolded = true;
}
}
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index ab6834cb262f..8383e06e6d24 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -50,19 +50,20 @@ add_mlir_dialect_library(MLIRGPUTransforms
Transforms/AsyncRegionRewriter.cpp
Transforms/BufferDeallocationOpInterfaceImpl.cpp
Transforms/DecomposeMemrefs.cpp
+ Transforms/EliminateBarriers.cpp
Transforms/GlobalIdRewriter.cpp
Transforms/KernelOutlining.cpp
Transforms/MemoryPromotion.cpp
Transforms/ModuleToBinary.cpp
Transforms/NVVMAttachTarget.cpp
Transforms/ParallelLoopMapper.cpp
+ Transforms/ROCDLAttachTarget.cpp
Transforms/SerializeToBlob.cpp
Transforms/SerializeToCubin.cpp
Transforms/SerializeToHsaco.cpp
Transforms/ShuffleRewriter.cpp
Transforms/SPIRVAttachTarget.cpp
- Transforms/ROCDLAttachTarget.cpp
- Transforms/EliminateBarriers.cpp
+ Transforms/SubgroupReduceLowering.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 7c3330f4c238..dd482f305fcb 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -19,6 +19,7 @@
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
@@ -588,8 +589,16 @@ static void printAllReduceOperation(AsmPrinter &printer, Operation *op,
//===----------------------------------------------------------------------===//
LogicalResult gpu::SubgroupReduceOp::verify() {
+ Type elemType = getType();
+ if (auto vecTy = dyn_cast<VectorType>(elemType)) {
+ if (vecTy.isScalable())
+ return emitOpError() << "is not compatible with scalable vector types";
+
+ elemType = vecTy.getElementType();
+ }
+
gpu::AllReduceOperation opName = getOp();
- if (failed(verifyReduceOpAndType(opName, getType()))) {
+ if (failed(verifyReduceOpAndType(opName, elemType))) {
return emitError() << '`' << gpu::stringifyAllReduceOperation(opName)
<< "` reduction operation is not compatible with type "
<< getType();
diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
index 095f8fd52051..70a9c77a6d79 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
@@ -9,4 +9,14 @@ add_mlir_dialect_library(MLIRGPUPipelines
MLIRFuncDialect
MLIRPass
MLIRTransforms
+ MLIRLinalgTransforms
+ MLIRAffineToStandard
+ MLIRGPUToNVVMTransforms
+ MLIRIndexToLLVM
+ MLIRMathToLLVM
+ MLIRNVGPUToNVVM
+ MLIRNVVMToLLVM
+ MLIRReconcileUnrealizedCasts
+ MLIRSCFToControlFlow
+ MLIRVectorToSCF
)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
new file mode 100644
index 000000000000..61edce5e2a08
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -0,0 +1,150 @@
+//===- SubgroupReduceLowering.cpp - subgroup_reduce lowering patterns -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements gradual lowering of `gpu.subgroup_reduce` ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+using namespace mlir;
+
+namespace {
+
+/// Example (assuming `maxShuffleBitwidth` equals 32):
+/// ```
+/// %a = gpu.subgroup_reduce add %x : (vector<3xf16>) -> vector<3xf16>
+/// ==>
+/// %v0 = arith.constant dense<0.0> : vector<3xf16>
+/// %e0 = vector.extract_strided_slice %x
+/// {offsets = [0], sizes = [2], strides = [1]} : vector<3xf16> to vector<2xf16>
+/// %r0 = gpu.subgroup_reduce add %e0 : (vector<2xf16>) -> vector<2xf16>
+/// %v1 = vector.insert_strided_slice %r0, %v0
+/// {offsets = [0], strides = [1]} : vector<2xf16> into vector<3xf16>
+/// %e1 = vector.extract %x[2] : f16 from vector<3xf16>
+/// %r1 = gpu.subgroup_reduce add %e1 : (f16) -> f16
+/// %a = vector.insert %r1, %v1[2] : f16 into vector<3xf16>
+/// ```
+struct BreakDownSubgroupReduce final : OpRewritePattern<gpu::SubgroupReduceOp> {
+ BreakDownSubgroupReduce(MLIRContext *ctx, unsigned maxShuffleBitwidth,
+ PatternBenefit benefit)
+ : OpRewritePattern(ctx, benefit), maxShuffleBitwidth(maxShuffleBitwidth) {
+ }
+
+ LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+ PatternRewriter &rewriter) const override {
+ auto vecTy = dyn_cast<VectorType>(op.getType());
+ if (!vecTy || vecTy.getNumElements() < 2)
+ return rewriter.notifyMatchFailure(op, "not a multi-element reduction");
+
+ assert(vecTy.getRank() == 1 && "Unexpected vector type");
+ assert(!vecTy.isScalable() && "Unexpected vector type");
+
+ Type elemTy = vecTy.getElementType();
+ unsigned elemBitwidth = elemTy.getIntOrFloatBitWidth();
+ if (elemBitwidth >= maxShuffleBitwidth)
+ return rewriter.notifyMatchFailure(
+ op, llvm::formatv("element type too large {0}, cannot break down "
+ "into vectors of bitwidth {1} or less",
+ elemBitwidth, maxShuffleBitwidth));
+
+ unsigned elementsPerShuffle = maxShuffleBitwidth / elemBitwidth;
+ assert(elementsPerShuffle >= 1);
+
+ unsigned numNewReductions =
+ llvm::divideCeil(vecTy.getNumElements(), elementsPerShuffle);
+ assert(numNewReductions >= 1);
+ if (numNewReductions == 1)
+ return rewriter.notifyMatchFailure(op, "nothing to break down");
+
+ Location loc = op.getLoc();
+ Value res =
+ rewriter.create<arith::ConstantOp>(loc, rewriter.getZeroAttr(vecTy));
+
+ for (unsigned i = 0; i != numNewReductions; ++i) {
+ int64_t startIdx = i * elementsPerShuffle;
+ int64_t endIdx =
+ std::min(startIdx + elementsPerShuffle, vecTy.getNumElements());
+ int64_t numElems = endIdx - startIdx;
+
+ Value extracted;
+ if (numElems == 1) {
+ extracted =
+ rewriter.create<vector::ExtractOp>(loc, op.getValue(), startIdx);
+ } else {
+ extracted = rewriter.create<vector::ExtractStridedSliceOp>(
+ loc, op.getValue(), /*offsets=*/startIdx, /*sizes=*/numElems,
+ /*strides=*/1);
+ }
+
+ Value reduce = rewriter.create<gpu::SubgroupReduceOp>(
+ loc, extracted, op.getOp(), op.getUniform());
+ if (numElems == 1) {
+ res = rewriter.create<vector::InsertOp>(loc, reduce, res, startIdx);
+ continue;
+ }
+
+ res = rewriter.create<vector::InsertStridedSliceOp>(
+ loc, reduce, res, /*offsets=*/startIdx, /*strides=*/1);
+ }
+
+ rewriter.replaceOp(op, res);
+ return success();
+ }
+
+private:
+ unsigned maxShuffleBitwidth = 0;
+};
+
+/// Example:
+/// ```
+/// %a = gpu.subgroup_reduce add %x : (vector<1xf32>) -> vector<1xf32>
+/// ==>
+/// %e0 = vector.extract %x[0] : f32 from vector<1xf32>
+/// %r0 = gpu.subgroup_reduce add %e0 : (f32) -> f32
+/// %a = vector.broadcast %r0 : f32 to vector<1xf32>
+/// ```
+struct ScalarizeSingleElementReduce final
+ : OpRewritePattern<gpu::SubgroupReduceOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
+ PatternRewriter &rewriter) const override {
+ auto vecTy = dyn_cast<VectorType>(op.getType());
+ if (!vecTy || vecTy.getNumElements() != 1)
+ return rewriter.notifyMatchFailure(op, "not a single-element reduction");
+
+ assert(vecTy.getRank() == 1 && "Unexpected vector type");
+ assert(!vecTy.isScalable() && "Unexpected vector type");
+ Location loc = op.getLoc();
+ Value extracted = rewriter.create<vector::ExtractOp>(loc, op.getValue(), 0);
+ Value reduce = rewriter.create<gpu::SubgroupReduceOp>(
+ loc, extracted, op.getOp(), op.getUniform());
+ rewriter.replaceOpWithNewOp<vector::BroadcastOp>(op, vecTy, reduce);
+ return success();
+ }
+};
+
+} // namespace
+
+void mlir::populateGpuBreakDownSubgrupReducePatterns(
+ RewritePatternSet &patterns, unsigned maxShuffleBitwidth,
+ PatternBenefit benefit) {
+ patterns.add<BreakDownSubgroupReduce>(patterns.getContext(),
+ maxShuffleBitwidth, benefit);
+ patterns.add<ScalarizeSingleElementReduce>(patterns.getContext(), benefit);
+}
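A minimal sketch of how these patterns might be wired up with the greedy rewrite driver; the wrapper function and include paths are assumptions, and the populate function name follows the spelling used in this patch:

#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include <utility>

static void lowerSubgroupReduces(mlir::Operation *root) {
  mlir::RewritePatternSet patterns(root->getContext());
  // Break multi-element reductions into <= 32-bit pieces; single-element
  // leftovers are scalarized by the second pattern in the set.
  mlir::populateGpuBreakDownSubgrupReducePatterns(patterns,
                                                  /*maxShuffleBitwidth=*/32,
                                                  /*benefit=*/1);
  (void)mlir::applyPatternsAndFoldGreedily(root, std::move(patterns));
}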
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 458bf83eac17..64388a9a0181 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -908,6 +908,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
FlatSymbolRefAttr callee, ValueRange args) {
+ assert(callee && "expected non-null callee in direct call builder");
build(builder, state, results,
TypeAttr::get(getLLVMFuncType(builder.getContext(), results, args)),
callee, args, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
index 6e9019f932aa..65c1daee6711 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp
@@ -50,11 +50,31 @@ static bool hasLifetimeMarkers(LLVM::AllocaOp allocaOp) {
static void
handleInlinedAllocas(Operation *call,
iterator_range<Region::iterator> inlinedBlocks) {
+ // Locate the entry block of the closest callsite ancestor that has either the
+ // IsolatedFromAbove or AutomaticAllocationScope trait. In pure LLVM dialect
+ // programs, this is the LLVMFuncOp containing the call site. However, in
+ // mixed-dialect programs, the callsite might be nested in another operation
+ // that carries one of these traits. In such scenarios, this traversal stops
+ // at the closest ancestor with either trait, ensuring visibility post
+ // relocation and respecting allocation scopes.
+ Block *callerEntryBlock = nullptr;
+ Operation *currentOp = call;
+ while (Operation *parentOp = currentOp->getParentOp()) {
+ if (parentOp->mightHaveTrait<OpTrait::IsIsolatedFromAbove>() ||
+ parentOp->mightHaveTrait<OpTrait::AutomaticAllocationScope>()) {
+ callerEntryBlock = &currentOp->getParentRegion()->front();
+ break;
+ }
+ currentOp = parentOp;
+ }
+
+ // Avoid relocating the alloca operations if the call has already been
+ // inlined into the entry block (typically that of the enclosing LLVM
+ // function), or if the relevant entry block cannot be identified.
Block *calleeEntryBlock = &(*inlinedBlocks.begin());
- Block *callerEntryBlock = &(*calleeEntryBlock->getParent()->begin());
- if (calleeEntryBlock == callerEntryBlock)
- // Nothing to do.
+ if (!callerEntryBlock || callerEntryBlock == calleeEntryBlock)
return;
+
SmallVector<std::tuple<LLVM::AllocaOp, IntegerAttr, bool>> allocasToMove;
bool shouldInsertLifetimes = false;
bool hasDynamicAlloca = false;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4f5d71e10f68..a4de89d928e1 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1003,7 +1003,7 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues(
{makeConstantI32(rewriter, static_cast<int>(getLayoutA())),
mlir::NVVM::PTXRegisterMod::Read});
asmValues.push_back(
- {makeConstantI32(rewriter, static_cast<int>(getLayoutB())),
+ {makeConstantI32(rewriter, 1 - static_cast<int>(getLayoutB())),
mlir::NVVM::PTXRegisterMod::Read});
}
}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
index 5a8320bdb287..f28f8f0d34a4 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp
@@ -60,7 +60,10 @@ LogicalResult linalg::linalgOpAnchoredEmptyTensorEliminationStep(
config.alwaysIncludeLeaves = false;
SetVector<Value> emptyTensors = state.findValueInReverseUseDefChain(
in->get(), /*condition=*/
- [&](Value val) { return val.getDefiningOp<tensor::EmptyOp>(); },
+ [&](Value val) {
+ return val.getDefiningOp<tensor::EmptyOp>() &&
+ val.getType() == in->get().getType();
+ },
config);
if (emptyTensors.empty())
continue;
diff --git a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
index 05a069d98ef3..05b813a3b1e9 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
@@ -8,10 +8,14 @@
#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Interfaces/RuntimeVerifiableOpInterface.h"
using namespace mlir;
@@ -21,6 +25,12 @@ static std::string generateErrorMessage(Operation *op, const std::string &msg) {
std::string buffer;
llvm::raw_string_ostream stream(buffer);
OpPrintingFlags flags;
+ // We may generate a lot of error messages, so we need to ensure that
+ // printing is fast.
+ flags.elideLargeElementsAttrs();
+ flags.printGenericOpForm();
+ flags.skipRegions();
+ flags.useLocalScope();
stream << "ERROR: Runtime op verification failed\n";
op->print(stream, flags);
stream << "\n^ " << msg;
@@ -133,6 +143,161 @@ struct CastOpInterface
}
};
+/// Verifies that the indices on load/store ops are in-bounds of the memref's
+/// index space: 0 <= index#i < dim#i
+template <typename LoadStoreOp>
+struct LoadStoreOpInterface
+ : public RuntimeVerifiableOpInterface::ExternalModel<
+ LoadStoreOpInterface<LoadStoreOp>, LoadStoreOp> {
+ void generateRuntimeVerification(Operation *op, OpBuilder &builder,
+ Location loc) const {
+ auto loadStoreOp = cast<LoadStoreOp>(op);
+
+ auto memref = loadStoreOp.getMemref();
+ auto rank = memref.getType().getRank();
+ if (rank == 0) {
+ return;
+ }
+ auto indices = loadStoreOp.getIndices();
+
+ auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+ Value assertCond;
+ for (auto i : llvm::seq<int64_t>(0, rank)) {
+ auto index = indices[i];
+
+ auto dimOp = builder.createOrFold<memref::DimOp>(loc, memref, i);
+
+ auto geLow = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sge, index, zero);
+ auto ltHigh = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::slt, index, dimOp);
+ auto andOp = builder.createOrFold<arith::AndIOp>(loc, geLow, ltHigh);
+
+ assertCond =
+ i > 0 ? builder.createOrFold<arith::AndIOp>(loc, assertCond, andOp)
+ : andOp;
+ }
+ builder.create<cf::AssertOp>(
+ loc, assertCond, generateErrorMessage(op, "out-of-bounds access"));
+ }
+};
+
+/// Compute the linear index for the provided strided layout and indices.
+Value computeLinearIndex(OpBuilder &builder, Location loc, OpFoldResult offset,
+ ArrayRef<OpFoldResult> strides,
+ ArrayRef<OpFoldResult> indices) {
+ auto [expr, values] = computeLinearIndex(offset, strides, indices);
+ auto index =
+ affine::makeComposedFoldedAffineApply(builder, loc, expr, values);
+ return getValueOrCreateConstantIndexOp(builder, loc, index);
+}
+
+/// Returns two Values representing the bounds of the provided strided layout
+/// metadata. The bounds are returned as a half-open interval -- [low, high).
+std::pair<Value, Value> computeLinearBounds(OpBuilder &builder, Location loc,
+ OpFoldResult offset,
+ ArrayRef<OpFoldResult> strides,
+ ArrayRef<OpFoldResult> sizes) {
+ auto zeros = SmallVector<int64_t>(sizes.size(), 0);
+ auto indices = getAsIndexOpFoldResult(builder.getContext(), zeros);
+ auto lowerBound = computeLinearIndex(builder, loc, offset, strides, indices);
+ auto upperBound = computeLinearIndex(builder, loc, offset, strides, sizes);
+ return {lowerBound, upperBound};
+}
+
+/// Returns two Values representing the bounds of the memref. The bounds are
+/// returned as a half-open interval -- [low, high).
+std::pair<Value, Value> computeLinearBounds(OpBuilder &builder, Location loc,
+ TypedValue<BaseMemRefType> memref) {
+ auto runtimeMetadata = builder.create<ExtractStridedMetadataOp>(loc, memref);
+ auto offset = runtimeMetadata.getConstifiedMixedOffset();
+ auto strides = runtimeMetadata.getConstifiedMixedStrides();
+ auto sizes = runtimeMetadata.getConstifiedMixedSizes();
+ return computeLinearBounds(builder, loc, offset, strides, sizes);
+}
+
+/// Verifies that the linear bounds of a reinterpret_cast op are within the
+/// linear bounds of the base memref: low >= baseLow && high <= baseHigh
+struct ReinterpretCastOpInterface
+ : public RuntimeVerifiableOpInterface::ExternalModel<
+ ReinterpretCastOpInterface, ReinterpretCastOp> {
+ void generateRuntimeVerification(Operation *op, OpBuilder &builder,
+ Location loc) const {
+ auto reinterpretCast = cast<ReinterpretCastOp>(op);
+ auto baseMemref = reinterpretCast.getSource();
+ auto resultMemref =
+ cast<TypedValue<BaseMemRefType>>(reinterpretCast.getResult());
+
+ builder.setInsertionPointAfter(op);
+
+ // Compute the linear bounds of the base memref
+ auto [baseLow, baseHigh] = computeLinearBounds(builder, loc, baseMemref);
+
+ // Compute the linear bounds of the resulting memref
+ auto [low, high] = computeLinearBounds(builder, loc, resultMemref);
+
+ // Check low >= baseLow
+ auto geLow = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sge, low, baseLow);
+
+ // Check high <= baseHigh
+ auto leHigh = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sle, high, baseHigh);
+
+ auto assertCond = builder.createOrFold<arith::AndIOp>(loc, geLow, leHigh);
+
+ builder.create<cf::AssertOp>(
+ loc, assertCond,
+ generateErrorMessage(
+ op,
+ "result of reinterpret_cast is out-of-bounds of the base memref"));
+ }
+};
+
+/// Verifies that the linear bounds of a subview op are within the linear bounds
+/// of the base memref: low >= baseLow && high <= baseHigh
+/// TODO: This is not yet a full runtime verification of subview. For example,
+/// consider:
+/// %m = memref.alloc(%c10, %c10) : memref<?x?xf32>
+/// memref.subview %m[%c0, %c0][%c20, %c2][%c1, %c1]
+/// : memref<?x?xf32> to memref<?x?xf32>
+/// The subview is in-bounds of the entire base memref but the first dimension
+/// is out-of-bounds. Future work would verify the bounds on a per-dimension
+/// basis.
+struct SubViewOpInterface
+ : public RuntimeVerifiableOpInterface::ExternalModel<SubViewOpInterface,
+ SubViewOp> {
+ void generateRuntimeVerification(Operation *op, OpBuilder &builder,
+ Location loc) const {
+ auto subView = cast<SubViewOp>(op);
+ auto baseMemref = cast<TypedValue<BaseMemRefType>>(subView.getSource());
+ auto resultMemref = cast<TypedValue<BaseMemRefType>>(subView.getResult());
+
+ builder.setInsertionPointAfter(op);
+
+ // Compute the linear bounds of the base memref
+ auto [baseLow, baseHigh] = computeLinearBounds(builder, loc, baseMemref);
+
+ // Compute the linear bounds of the resulting memref
+ auto [low, high] = computeLinearBounds(builder, loc, resultMemref);
+
+ // Check low >= baseLow
+ auto geLow = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sge, low, baseLow);
+
+ // Check high <= baseHigh
+ auto leHigh = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sle, high, baseHigh);
+
+ auto assertCond = builder.createOrFold<arith::AndIOp>(loc, geLow, leHigh);
+
+ builder.create<cf::AssertOp>(
+ loc, assertCond,
+ generateErrorMessage(op,
+ "subview is out-of-bounds of the base memref"));
+ }
+};
+
struct ExpandShapeOpInterface
: public RuntimeVerifiableOpInterface::ExternalModel<ExpandShapeOpInterface,
ExpandShapeOp> {
@@ -183,8 +348,13 @@ void mlir::memref::registerRuntimeVerifiableOpInterfaceExternalModels(
registry.addExtension(+[](MLIRContext *ctx, memref::MemRefDialect *dialect) {
CastOp::attachInterface<CastOpInterface>(*ctx);
ExpandShapeOp::attachInterface<ExpandShapeOpInterface>(*ctx);
+ LoadOp::attachInterface<LoadStoreOpInterface<LoadOp>>(*ctx);
+ ReinterpretCastOp::attachInterface<ReinterpretCastOpInterface>(*ctx);
+ StoreOp::attachInterface<LoadStoreOpInterface<StoreOp>>(*ctx);
+ SubViewOp::attachInterface<SubViewOpInterface>(*ctx);
// Load additional dialects of which ops may get created.
- ctx->loadDialect<arith::ArithDialect, cf::ControlFlowDialect>();
+ ctx->loadDialect<affine::AffineDialect, arith::ArithDialect,
+ cf::ControlFlowDialect>();
});
}
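A plain-C++ analogue of the condition materialized by LoadStoreOpInterface above, shown only to make the generated check explicit: each index must satisfy 0 <= index < dim, and the per-dimension results are AND-ed into the single condition passed to cf.assert.

#include <cstddef>
#include <cstdint>
#include <vector>

static bool inBounds(const std::vector<int64_t> &indices,
                     const std::vector<int64_t> &dims) {
  bool cond = true;
  for (std::size_t i = 0; i < indices.size(); ++i)
    cond = cond && (indices[i] >= 0 && indices[i] < dims[i]);
  return cond;
}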
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index df4f7825545c..f484eda3268d 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -615,15 +615,49 @@ unsigned ParallelOp::getNumDataOperands() {
}
Value ParallelOp::getDataOperand(unsigned i) {
- unsigned numOptional = getAsync() ? 1 : 0;
+ unsigned numOptional = getAsync().size();
numOptional += getNumGangs().size();
- numOptional += getNumWorkers() ? 1 : 0;
- numOptional += getVectorLength() ? 1 : 0;
+ numOptional += getNumWorkers().size();
+ numOptional += getVectorLength().size();
numOptional += getIfCond() ? 1 : 0;
numOptional += getSelfCond() ? 1 : 0;
return getOperand(getWaitOperands().size() + numOptional + i);
}
+template <typename Op>
+static LogicalResult verifyDeviceTypeCountMatch(Op op, OperandRange operands,
+ ArrayAttr deviceTypes,
+ llvm::StringRef keyword) {
+ if (!operands.empty() && deviceTypes.getValue().size() != operands.size())
+ return op.emitOpError() << keyword << " operands count must match "
+ << keyword << " device_type count";
+ return success();
+}
+
+template <typename Op>
+static LogicalResult verifyDeviceTypeAndSegmentCountMatch(
+ Op op, OperandRange operands, DenseI32ArrayAttr segments,
+ ArrayAttr deviceTypes, llvm::StringRef keyword, int32_t maxInSegment = 0) {
+ std::size_t numOperandsInSegments = 0;
+
+ if (!segments)
+ return success();
+
+ for (auto segCount : segments.asArrayRef()) {
+ if (maxInSegment != 0 && segCount > maxInSegment)
+ return op.emitOpError() << keyword << " expects a maximum of "
+ << maxInSegment << " values per segment";
+ numOperandsInSegments += segCount;
+ }
+ if (numOperandsInSegments != operands.size())
+ return op.emitOpError()
+ << keyword << " operand count does not match count in segments";
+ if (deviceTypes.getValue().size() != (size_t)segments.size())
+ return op.emitOpError()
+ << keyword << " segment count does not match device_type count";
+ return success();
+}
+
LogicalResult acc::ParallelOp::verify() {
if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>(
*this, getPrivatizations(), getGangPrivateOperands(), "private",
@@ -633,11 +667,322 @@ LogicalResult acc::ParallelOp::verify() {
*this, getReductionRecipes(), getReductionOperands(), "reduction",
"reductions", false)))
return failure();
- if (getNumGangs().size() > 3)
- return emitOpError() << "num_gangs expects a maximum of 3 values";
+
+ if (failed(verifyDeviceTypeAndSegmentCountMatch(
+ *this, getNumGangs(), getNumGangsSegmentsAttr(),
+ getNumGangsDeviceTypeAttr(), "num_gangs", 3)))
+ return failure();
+
+ if (failed(verifyDeviceTypeAndSegmentCountMatch(
+ *this, getWaitOperands(), getWaitOperandsSegmentsAttr(),
+ getWaitOperandsDeviceTypeAttr(), "wait")))
+ return failure();
+
+ if (failed(verifyDeviceTypeCountMatch(*this, getNumWorkers(),
+ getNumWorkersDeviceTypeAttr(),
+ "num_workers")))
+ return failure();
+
+ if (failed(verifyDeviceTypeCountMatch(*this, getVectorLength(),
+ getVectorLengthDeviceTypeAttr(),
+ "vector_length")))
+ return failure();
+
+ if (failed(verifyDeviceTypeCountMatch(*this, getAsync(),
+ getAsyncDeviceTypeAttr(), "async")))
+ return failure();
+
return checkDataOperands<acc::ParallelOp>(*this, getDataClauseOperands());
}
+static std::optional<unsigned> findSegment(ArrayAttr segments,
+ mlir::acc::DeviceType deviceType) {
+ unsigned segmentIdx = 0;
+ for (auto attr : segments) {
+ auto deviceTypeAttr = mlir::dyn_cast<mlir::acc::DeviceTypeAttr>(attr);
+ if (deviceTypeAttr.getValue() == deviceType)
+ return std::make_optional(segmentIdx);
+ ++segmentIdx;
+ }
+ return std::nullopt;
+}
+
+static mlir::Value
+getValueInDeviceTypeSegment(std::optional<mlir::ArrayAttr> arrayAttr,
+ mlir::Operation::operand_range range,
+ mlir::acc::DeviceType deviceType) {
+ if (!arrayAttr)
+ return {};
+ if (auto pos = findSegment(*arrayAttr, deviceType))
+ return range[*pos];
+ return {};
+}
+
+bool acc::ParallelOp::hasAsyncOnly() {
+ return hasAsyncOnly(mlir::acc::DeviceType::None);
+}
+
+bool acc::ParallelOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) {
+ if (auto arrayAttr = getAsyncOnly()) {
+ if (findSegment(*arrayAttr, deviceType))
+ return true;
+ }
+ return false;
+}
+
+mlir::Value acc::ParallelOp::getAsyncValue() {
+ return getAsyncValue(mlir::acc::DeviceType::None);
+}
+
+mlir::Value acc::ParallelOp::getAsyncValue(mlir::acc::DeviceType deviceType) {
+ return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(),
+ deviceType);
+}
+
+mlir::Value acc::ParallelOp::getNumWorkersValue() {
+ return getNumWorkersValue(mlir::acc::DeviceType::None);
+}
+
+mlir::Value
+acc::ParallelOp::getNumWorkersValue(mlir::acc::DeviceType deviceType) {
+ return getValueInDeviceTypeSegment(getNumWorkersDeviceType(), getNumWorkers(),
+ deviceType);
+}
+
+mlir::Value acc::ParallelOp::getVectorLengthValue() {
+ return getVectorLengthValue(mlir::acc::DeviceType::None);
+}
+
+mlir::Value
+acc::ParallelOp::getVectorLengthValue(mlir::acc::DeviceType deviceType) {
+ return getValueInDeviceTypeSegment(getVectorLengthDeviceType(),
+ getVectorLength(), deviceType);
+}
+
+mlir::Operation::operand_range ParallelOp::getNumGangsValues() {
+ return getNumGangsValues(mlir::acc::DeviceType::None);
+}
+
+static mlir::Operation::operand_range
+getValuesFromSegments(std::optional<mlir::ArrayAttr> arrayAttr,
+ mlir::Operation::operand_range range,
+ std::optional<llvm::ArrayRef<int32_t>> segments,
+ mlir::acc::DeviceType deviceType) {
+ if (!arrayAttr)
+ return range.take_front(0);
+ if (auto pos = findSegment(*arrayAttr, deviceType)) {
+ int32_t nbOperandsBefore = 0;
+ for (unsigned i = 0; i < *pos; ++i)
+ nbOperandsBefore += (*segments)[i];
+ return range.drop_front(nbOperandsBefore).take_front((*segments)[*pos]);
+ }
+ return range.take_front(0);
+}
+
+mlir::Operation::operand_range
+ParallelOp::getNumGangsValues(mlir::acc::DeviceType deviceType) {
+ return getValuesFromSegments(getNumGangsDeviceType(), getNumGangs(),
+ getNumGangsSegments(), deviceType);
+}
+
+bool acc::ParallelOp::hasWaitOnly() {
+ return hasWaitOnly(mlir::acc::DeviceType::None);
+}
+
+bool acc::ParallelOp::hasWaitOnly(mlir::acc::DeviceType deviceType) {
+ if (auto arrayAttr = getWaitOnly()) {
+ if (findSegment(*arrayAttr, deviceType))
+ return true;
+ }
+ return false;
+}
+
+mlir::Operation::operand_range ParallelOp::getWaitValues() {
+ return getWaitValues(mlir::acc::DeviceType::None);
+}
+
+mlir::Operation::operand_range
+ParallelOp::getWaitValues(mlir::acc::DeviceType deviceType) {
+ return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(),
+ getWaitOperandsSegments(), deviceType);
+}
+
+static ParseResult parseNumGangs(
+ mlir::OpAsmParser &parser,
+ llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands,
+ llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes,
+ mlir::DenseI32ArrayAttr &segments) {
+ llvm::SmallVector<DeviceTypeAttr> attributes;
+ llvm::SmallVector<int32_t> seg;
+
+ do {
+ if (failed(parser.parseLBrace()))
+ return failure();
+
+ if (failed(parser.parseCommaSeparatedList(
+ mlir::AsmParser::Delimiter::None, [&]() {
+ if (parser.parseOperand(operands.emplace_back()) ||
+ parser.parseColonType(types.emplace_back()))
+ return failure();
+ return success();
+ })))
+ return failure();
+
+ seg.push_back(operands.size());
+
+ if (failed(parser.parseRBrace()))
+ return failure();
+
+ if (succeeded(parser.parseOptionalLSquare())) {
+ if (parser.parseAttribute(attributes.emplace_back()) ||
+ parser.parseRSquare())
+ return failure();
+ } else {
+ attributes.push_back(mlir::acc::DeviceTypeAttr::get(
+ parser.getContext(), mlir::acc::DeviceType::None));
+ }
+ } while (succeeded(parser.parseOptionalComma()));
+
+ llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(),
+ attributes.end());
+ deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr);
+ segments = DenseI32ArrayAttr::get(parser.getContext(), seg);
+
+ return success();
+}
+
+static void printNumGangs(mlir::OpAsmPrinter &p, mlir::Operation *op,
+ mlir::OperandRange operands, mlir::TypeRange types,
+ std::optional<mlir::ArrayAttr> deviceTypes,
+ std::optional<mlir::DenseI32ArrayAttr> segments) {
+ unsigned opIdx = 0;
+ for (unsigned i = 0; i < deviceTypes->size(); ++i) {
+ if (i != 0)
+ p << ", ";
+ p << "{";
+ for (int32_t j = 0; j < (*segments)[i]; ++j) {
+ if (j != 0)
+ p << ", ";
+ p << operands[opIdx] << " : " << operands[opIdx].getType();
+ ++opIdx;
+ }
+ p << "}";
+ auto deviceTypeAttr =
+ mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]);
+ if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None)
+ p << " [" << (*deviceTypes)[i] << "]";
+ }
+}
+
+static ParseResult parseWaitOperands(
+ mlir::OpAsmParser &parser,
+ llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands,
+ llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes,
+ mlir::DenseI32ArrayAttr &segments) {
+ llvm::SmallVector<DeviceTypeAttr> attributes;
+ llvm::SmallVector<int32_t> seg;
+
+ do {
+ if (failed(parser.parseLBrace()))
+ return failure();
+
+ if (failed(parser.parseCommaSeparatedList(
+ mlir::AsmParser::Delimiter::None, [&]() {
+ if (parser.parseOperand(operands.emplace_back()) ||
+ parser.parseColonType(types.emplace_back()))
+ return failure();
+ return success();
+ })))
+ return failure();
+
+ seg.push_back(operands.size());
+
+ if (failed(parser.parseRBrace()))
+ return failure();
+
+ if (succeeded(parser.parseOptionalLSquare())) {
+ if (parser.parseAttribute(attributes.emplace_back()) ||
+ parser.parseRSquare())
+ return failure();
+ } else {
+ attributes.push_back(mlir::acc::DeviceTypeAttr::get(
+ parser.getContext(), mlir::acc::DeviceType::None));
+ }
+ } while (succeeded(parser.parseOptionalComma()));
+
+ llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(),
+ attributes.end());
+ deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr);
+ segments = DenseI32ArrayAttr::get(parser.getContext(), seg);
+
+ return success();
+}
+
+static void printWaitOperands(mlir::OpAsmPrinter &p, mlir::Operation *op,
+ mlir::OperandRange operands,
+ mlir::TypeRange types,
+ std::optional<mlir::ArrayAttr> deviceTypes,
+ std::optional<mlir::DenseI32ArrayAttr> segments) {
+ unsigned opIdx = 0;
+ for (unsigned i = 0; i < deviceTypes->size(); ++i) {
+ if (i != 0)
+ p << ", ";
+ p << "{";
+ for (int32_t j = 0; j < (*segments)[i]; ++j) {
+ if (j != 0)
+ p << ", ";
+ p << operands[opIdx] << " : " << operands[opIdx].getType();
+ ++opIdx;
+ }
+ p << "}";
+ auto deviceTypeAttr =
+ mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]);
+ if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None)
+ p << " [" << (*deviceTypes)[i] << "]";
+ }
+}
+
+static ParseResult parseDeviceTypeOperands(
+ mlir::OpAsmParser &parser,
+ llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands,
+ llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes) {
+ llvm::SmallVector<DeviceTypeAttr> attributes;
+ if (failed(parser.parseCommaSeparatedList([&]() {
+ if (parser.parseOperand(operands.emplace_back()) ||
+ parser.parseColonType(types.emplace_back()))
+ return failure();
+ if (succeeded(parser.parseOptionalLSquare())) {
+ if (parser.parseAttribute(attributes.emplace_back()) ||
+ parser.parseRSquare())
+ return failure();
+ } else {
+ attributes.push_back(mlir::acc::DeviceTypeAttr::get(
+ parser.getContext(), mlir::acc::DeviceType::None));
+ }
+ return success();
+ })))
+ return failure();
+ llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(),
+ attributes.end());
+ deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr);
+ return success();
+}
+
+static void
+printDeviceTypeOperands(mlir::OpAsmPrinter &p, mlir::Operation *op,
+ mlir::OperandRange operands, mlir::TypeRange types,
+ std::optional<mlir::ArrayAttr> deviceTypes) {
+ for (unsigned i = 0, e = deviceTypes->size(); i < e; ++i) {
+ if (i != 0)
+ p << ", ";
+ p << operands[i] << " : " << operands[i].getType();
+ auto deviceTypeAttr =
+ mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]);
+ if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None)
+ p << " [" << (*deviceTypes)[i] << "]";
+ }
+}
+
//===----------------------------------------------------------------------===//
// SerialOp
//===----------------------------------------------------------------------===//
@@ -648,12 +993,55 @@ unsigned SerialOp::getNumDataOperands() {
}
Value SerialOp::getDataOperand(unsigned i) {
- unsigned numOptional = getAsync() ? 1 : 0;
+ unsigned numOptional = getAsync().size();
numOptional += getIfCond() ? 1 : 0;
numOptional += getSelfCond() ? 1 : 0;
return getOperand(getWaitOperands().size() + numOptional + i);
}
+bool acc::SerialOp::hasAsyncOnly() {
+ return hasAsyncOnly(mlir::acc::DeviceType::None);
+}
+
+bool acc::SerialOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) {
+ if (auto arrayAttr = getAsyncOnly()) {
+ if (findSegment(*arrayAttr, deviceType))
+ return true;
+ }
+ return false;
+}
+
+mlir::Value acc::SerialOp::getAsyncValue() {
+ return getAsyncValue(mlir::acc::DeviceType::None);
+}
+
+mlir::Value acc::SerialOp::getAsyncValue(mlir::acc::DeviceType deviceType) {
+ return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(),
+ deviceType);
+}
+
+bool acc::SerialOp::hasWaitOnly() {
+ return hasWaitOnly(mlir::acc::DeviceType::None);
+}
+
+bool acc::SerialOp::hasWaitOnly(mlir::acc::DeviceType deviceType) {
+ if (auto arrayAttr = getWaitOnly()) {
+ if (findSegment(*arrayAttr, deviceType))
+ return true;
+ }
+ return false;
+}
+
+mlir::Operation::operand_range SerialOp::getWaitValues() {
+ return getWaitValues(mlir::acc::DeviceType::None);
+}
+
+mlir::Operation::operand_range
+SerialOp::getWaitValues(mlir::acc::DeviceType deviceType) {
+ return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(),
+ getWaitOperandsSegments(), deviceType);
+}
+
LogicalResult acc::SerialOp::verify() {
if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>(
*this, getPrivatizations(), getGangPrivateOperands(), "private",
@@ -663,6 +1051,16 @@ LogicalResult acc::SerialOp::verify() {
*this, getReductionRecipes(), getReductionOperands(), "reduction",
"reductions", false)))
return failure();
+
+ if (failed(verifyDeviceTypeAndSegmentCountMatch(
+ *this, getWaitOperands(), getWaitOperandsSegmentsAttr(),
+ getWaitOperandsDeviceTypeAttr(), "wait")))
+ return failure();
+
+ if (failed(verifyDeviceTypeCountMatch(*this, getAsync(),
+ getAsyncDeviceTypeAttr(), "async")))
+ return failure();
+
return checkDataOperands<acc::SerialOp>(*this, getDataClauseOperands());
}
@@ -675,19 +1073,114 @@ unsigned KernelsOp::getNumDataOperands() {
}
Value KernelsOp::getDataOperand(unsigned i) {
- unsigned numOptional = getAsync() ? 1 : 0;
+ unsigned numOptional = getAsync().size();
numOptional += getWaitOperands().size();
numOptional += getNumGangs().size();
- numOptional += getNumWorkers() ? 1 : 0;
- numOptional += getVectorLength() ? 1 : 0;
+ numOptional += getNumWorkers().size();
+ numOptional += getVectorLength().size();
numOptional += getIfCond() ? 1 : 0;
numOptional += getSelfCond() ? 1 : 0;
return getOperand(numOptional + i);
}
+bool acc::KernelsOp::hasAsyncOnly() {
+ return hasAsyncOnly(mlir::acc::DeviceType::None);
+}
+
+bool acc::KernelsOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) {
+ if (auto arrayAttr = getAsyncOnly()) {
+ if (findSegment(*arrayAttr, deviceType))
+ return true;
+ }
+ return false;
+}
+
+mlir::Value acc::KernelsOp::getAsyncValue() {
+ return getAsyncValue(mlir::acc::DeviceType::None);
+}
+
+mlir::Value acc::KernelsOp::getAsyncValue(mlir::acc::DeviceType deviceType) {
+ return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(),
+ deviceType);
+}
+
+mlir::Value acc::KernelsOp::getNumWorkersValue() {
+ return getNumWorkersValue(mlir::acc::DeviceType::None);
+}
+
+mlir::Value
+acc::KernelsOp::getNumWorkersValue(mlir::acc::DeviceType deviceType) {
+ return getValueInDeviceTypeSegment(getNumWorkersDeviceType(), getNumWorkers(),
+ deviceType);
+}
+
+mlir::Value acc::KernelsOp::getVectorLengthValue() {
+ return getVectorLengthValue(mlir::acc::DeviceType::None);
+}
+
+mlir::Value
+acc::KernelsOp::getVectorLengthValue(mlir::acc::DeviceType deviceType) {
+ return getValueInDeviceTypeSegment(getVectorLengthDeviceType(),
+ getVectorLength(), deviceType);
+}
+
+mlir::Operation::operand_range KernelsOp::getNumGangsValues() {
+ return getNumGangsValues(mlir::acc::DeviceType::None);
+}
+
+mlir::Operation::operand_range
+KernelsOp::getNumGangsValues(mlir::acc::DeviceType deviceType) {
+ return getValuesFromSegments(getNumGangsDeviceType(), getNumGangs(),
+ getNumGangsSegments(), deviceType);
+}
+
+bool acc::KernelsOp::hasWaitOnly() {
+ return hasWaitOnly(mlir::acc::DeviceType::None);
+}
+
+bool acc::KernelsOp::hasWaitOnly(mlir::acc::DeviceType deviceType) {
+ if (auto arrayAttr = getWaitOnly()) {
+ if (findSegment(*arrayAttr, deviceType))
+ return true;
+ }
+ return false;
+}
+
+mlir::Operation::operand_range KernelsOp::getWaitValues() {
+ return getWaitValues(mlir::acc::DeviceType::None);
+}
+
+mlir::Operation::operand_range
+KernelsOp::getWaitValues(mlir::acc::DeviceType deviceType) {
+ return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(),
+ getWaitOperandsSegments(), deviceType);
+}
+
LogicalResult acc::KernelsOp::verify() {
- if (getNumGangs().size() > 3)
- return emitOpError() << "num_gangs expects a maximum of 3 values";
+ if (failed(verifyDeviceTypeAndSegmentCountMatch(
+ *this, getNumGangs(), getNumGangsSegmentsAttr(),
+ getNumGangsDeviceTypeAttr(), "num_gangs", 3)))
+ return failure();
+
+ if (failed(verifyDeviceTypeAndSegmentCountMatch(
+ *this, getWaitOperands(), getWaitOperandsSegmentsAttr(),
+ getWaitOperandsDeviceTypeAttr(), "wait")))
+ return failure();
+
+ if (failed(verifyDeviceTypeCountMatch(*this, getNumWorkers(),
+ getNumWorkersDeviceTypeAttr(),
+ "num_workers")))
+ return failure();
+
+ if (failed(verifyDeviceTypeCountMatch(*this, getVectorLength(),
+ getVectorLengthDeviceTypeAttr(),
+ "vector_length")))
+ return failure();
+
+ if (failed(verifyDeviceTypeCountMatch(*this, getAsync(),
+ getAsyncDeviceTypeAttr(), "async")))
+ return failure();
+
return checkDataOperands<acc::KernelsOp>(*this, getDataClauseOperands());
}
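A plain-C++ analogue of the per-device_type segment lookup used by getValuesFromSegments above; the helper is hypothetical. The i-th device_type attribute owns segments[i] consecutive operands, so a device type's operands start after the sum of the preceding segment sizes.

#include <cstdint>
#include <numeric>
#include <optional>
#include <utility>
#include <vector>

// Returns {offset, count} into the operand range, or {0, 0} when the device
// type has no segment.
static std::pair<int32_t, int32_t>
segmentRange(const std::vector<int32_t> &segments,
             std::optional<unsigned> pos) {
  if (!pos)
    return {0, 0};
  int32_t before = std::accumulate(segments.begin(), segments.begin() + *pos,
                                   int32_t{0});
  return {before, segments[*pos]};
}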
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
index 08ddc7c25aa9..4c62289a1e94 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
@@ -644,6 +644,45 @@ OpFoldResult spirv::UModOp::fold(FoldAdaptor adaptor) {
}
//===----------------------------------------------------------------------===//
+// spirv.SNegate
+//===----------------------------------------------------------------------===//
+
+OpFoldResult spirv::SNegateOp::fold(FoldAdaptor adaptor) {
+ // -(-x) = 0 - (0 - x) = x
+ auto op = getOperand();
+ if (auto negateOp = op.getDefiningOp<spirv::SNegateOp>())
+ return negateOp->getOperand(0);
+
+ // According to the SPIR-V spec:
+ //
+ // Signed-integer subtract of Operand from zero.
+ return constFoldUnaryOp<IntegerAttr>(
+ adaptor.getOperands(), [](const APInt &a) {
+ APInt zero = APInt::getZero(a.getBitWidth());
+ return zero - a;
+ });
+}
+
+//===----------------------------------------------------------------------===//
+// spirv.NotOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult spirv::NotOp::fold(spirv::NotOp::FoldAdaptor adaptor) {
+ // !(!x) = x
+ auto op = getOperand();
+ if (auto notOp = op.getDefiningOp<spirv::NotOp>())
+ return notOp->getOperand(0);
+
+ // According to the SPIR-V spec:
+ //
+ // Complement the bits of Operand.
+ return constFoldUnaryOp<IntegerAttr>(adaptor.getOperands(), [&](APInt a) {
+ a.flipAllBits();
+ return a;
+ });
+}
+
+//===----------------------------------------------------------------------===//
// spirv.LogicalAnd
//===----------------------------------------------------------------------===//
@@ -714,6 +753,22 @@ OpFoldResult spirv::LogicalNotEqualOp::fold(FoldAdaptor adaptor) {
// spirv.LogicalNot
//===----------------------------------------------------------------------===//
+OpFoldResult spirv::LogicalNotOp::fold(FoldAdaptor adaptor) {
+ // !(!x) = x
+ auto op = getOperand();
+ if (auto notOp = op.getDefiningOp<spirv::LogicalNotOp>())
+ return notOp->getOperand(0);
+
+ // According to the SPIR-V spec:
+ //
+ // Complement the bits of Operand.
+ return constFoldUnaryOp<IntegerAttr>(adaptor.getOperands(),
+ [](const APInt &a) {
+ APInt zero = APInt::getZero(1);
+ return a == 1 ? zero : (zero + 1);
+ });
+}
+
void spirv::LogicalNotOp::getCanonicalizationPatterns(
RewritePatternSet &results, MLIRContext *context) {
results
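The arithmetic behind the new SNegate and Not folds, restated as plain APInt helpers (hypothetical names): signed negation is a subtract-from-zero and Not complements every bit, which is why applying either op twice folds back to the original operand.

#include "llvm/ADT/APInt.h"

static llvm::APInt foldSNegate(const llvm::APInt &a) {
  // Signed-integer subtract of the operand from zero.
  return llvm::APInt::getZero(a.getBitWidth()) - a;
}

static llvm::APInt foldNot(llvm::APInt a) {
  // Complement the bits of the operand.
  a.flipAllBits();
  return a;
}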
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp
index 4bd3af2d3f2f..d3de55e4d59b 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenEnv.cpp
@@ -85,7 +85,6 @@ void CodegenEnv::startEmit() {
for (Level lvl = 0; lvl < lvlRank; lvl++)
sortDependentLoops(latticeMerger.getDependentLoops(tid, lvl));
}
-
loopEmitter.initialize(
tensors,
StringAttr::get(linalgOp.getContext(),
@@ -95,17 +94,8 @@ void CodegenEnv::startEmit() {
// TODO: compute the map and pass it to loop emitter directly instead of
// passing in a callback.
/*dependentLvlGetter=*/
- [this](TensorId t,
- Level lvl) -> std::vector<std::pair<TensorLevel, unsigned>> {
- // Translates from a list of loop indices to a list of [tid, lvl] pair.
- std::vector<LoopCoeffPair> &rLoops = merger().getDependentLoops(t, lvl);
- std::vector<std::pair<TensorLevel, unsigned>> ret;
- ret.reserve(rLoops.size());
- for (auto [loop, coeff] : rLoops) {
- TensorLevel tl = makeTensorLevel(merger().getLoopDefiningLvl(loop));
- ret.emplace_back(tl, coeff);
- };
- return ret;
+ [this](TensorId t, Level lvl) -> std::vector<LoopCoeffPair> {
+ return merger().getDependentLoops(t, lvl);
});
}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index d60b6ccd7321..80dad0646762 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -391,13 +391,18 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
/*posTupleNum=*/Value(), std::nullopt, 0);
if (dimGetter && !isSynTensor(tid)) {
for (Level l = 0; l < lvlRank; l++) {
- dependentLvlMap[tid][l] = dimGetter(tid, l);
+ std::vector<std::pair<LoopId, unsigned>> deps = dimGetter(tid, l);
+ // Sort the dependencies by loop order.
+ std::sort(deps.begin(), deps.end(),
+ [](auto &lhs, auto &rhs) { return lhs.first < rhs.first; });
+
+ dependentLvlMap[tid][l] = std::move(deps);
unsigned depends = dependentLvlMap[tid][l].size();
if (depends == 0)
continue;
- sliceMeta[tid][l].assign(depends, std::make_pair(nullptr, 0));
+ sliceMeta[tid][l].reserve(depends);
// We need `depends - 1` slices to fully reduce the affine expression.
- slicePosBuffer[tid][l].assign(depends - 1, nullptr);
+ slicePosBuffer[tid][l].reserve(depends - 1);
}
}
}
@@ -487,35 +492,70 @@ void LoopEmitter::initializeLoopEmit(
// hoist the code ouside if-conditions.
}
- Type indexType = builder.getIndexType();
- Value c0 = constantZero(builder, loc, indexType);
+ initSliceDriven(builder, loc);
+}
+
+void LoopEmitter::initSliceDriven(OpBuilder &builder, Location loc) {
+ Value c0 = C_IDX(0);
for (TensorId t = 0, e = tensors.size(); t < e; t++) {
auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
if (!rtp)
continue;
Level lvlRank = SparseTensorType(rtp).getLvlRank();
+
+ // Compute the dependency reduction order.
+ auto remDepStack = dependentLvlMap;
+ std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder;
for (Level lvl = 0; lvl < lvlRank; lvl++) {
- if (!dependentLvlMap[t][lvl].empty()) {
- ArrayRef<std::pair<TensorLevel, unsigned>> depLvls =
- dependentLvlMap[t][lvl];
- // Needs at least two operands to form a non-trivial affine expression.
- assert(depLvls.size() == sliceMeta[t][lvl].size());
-
- Value size = c0;
- for (int e = depLvls.size() - 1; e >= 0; e--) {
- auto [dt, dl] = unpackTensorLevel(depLvls[e].first);
- unsigned stride = depLvls[e].second;
- Value stridedSize = lvlSizes[dt][dl];
- if (stride != 1)
- stridedSize = MULI(stridedSize, C_IDX(stride));
- size = ADDI(size, stridedSize);
- sliceMeta[t][lvl][e] = std::make_pair(size, stride);
- }
+ // Reverse queue into a stack.
+ std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
+ for (auto [loop, coeff] : dependentLvlMap[t][lvl])
+ depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
+ }
+
+ if (depRedOrder.empty())
+ continue;
+ std::sort(depRedOrder.begin(), depRedOrder.end(),
+ [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); });
+
+ for (auto [loop, t, lvl] : depRedOrder) {
+ std::pair<LoopId, unsigned> curDep = remDepStack[t][lvl].back();
+ assert(curDep.first == loop);
+ Value size = c0;
+ for (auto [loop, stride] : remDepStack[t][lvl]) {
+ // The synthetic tensor high defines the loop upper bound.
+ Value loopHi = highs[getSynTensorId()][loop];
+ size = ADDI(size, MULI(loopHi, C_IDX(stride)));
}
+ sliceMeta[t][lvl].emplace_back(size, curDep.second);
+ remDepStack[t][lvl].pop_back();
+
+ // Generate the caches required to quickly compute the next non-empty
+ // slice at an increasing offset for slice-based loops.
+ // We do not need a cache for dense levels.
+ if (!remDepStack[t][lvl].empty() && !isDenseLT(lvls[t][lvl]->getLT())) {
+ Value cnt = C_IDX(1);
+ for (int preLvl = lvl - 1; preLvl >= 0; preLvl--) {
+ if (remDepStack[t][preLvl].empty())
+ break;
+ assert(remDepStack[t][preLvl].size() == 1 && "Not implemented");
+ auto [loop, stride] = remDepStack[t][preLvl].back();
+ assert(stride == 1 && "Not yet implemented");
+ // Accumulate the size required to cache the pLo for the slice.
+ // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the
+ // second level, we at most need a memref<d0xindex>.
+ //
+ // NOTE: this is apparently an over-approximation when the previous
+ // level is compressed, and we can compute a precise memory size
+ // inside the loops. But that would also require us to allocate/free
+ // memory in loops.
+ cnt = MULI(highs[getSynTensorId()][loop], cnt);
+ }
+ slicePosBuffer[t][lvl].push_back(allocSlicePosBuf(builder, loc, cnt));
+ } // else fully resolved.
}
}
- localInsertPos = builder.getInsertionPoint()->getPrevNode();
}
void LoopEmitter::categorizeLoopCondition(
@@ -1878,9 +1918,6 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
// simple dim expression in between).
assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1);
- // Check slice stack integrity.
- assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth);
-
SmallVector<const SliceInfo *> unResSlices;
std::optional<std::pair<TensorId, Level>> firstResLvl;
for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
@@ -2006,37 +2043,6 @@ bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
if (baseEnc.isSlice())
llvm_unreachable("TODO: not yet implemented");
- // Generate caches required to fast compute next-non-empty slices with
- // increasing offset for slice-base loop.
- // We do not need cache for dense levels.
- if (slicePosBuffer[tid][lvl][0] == nullptr && !isDenseLT(lvlType)) {
- OpBuilder::InsertionGuard guard(builder);
- // The buffer can be reused, and the size is loop invariant: it only
- // depends on the iteration graph's toposort.
- builder.setInsertionPointAfter(localInsertPos);
- Value tupleCnt = C_IDX(1);
- // Accumlates the size required to cache the pLo for the slice.
- // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the second
- // level. We at most need to a memref<d0xindex>.
- // NOTE: this is apperantly an over-approximation when the previous
- // level is compressed, and we can compute a precise memory size
- // inside the loops. But that would also requires us to allocate/free
- // memorys in loops.
- // TODO: Maybe using allocaScopeOp inside the loop to resolve the issue?
- for (Level curLevel = lvl;
- curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) {
- // We only handle cases when all the previously unresolved levels are
- // fully reduced.
- assert(depFullyReduced(tid, curLevel - 1));
- assert(!sliceMeta[tid][curLevel - 1].empty());
- auto [sz, stride] = sliceMeta[tid][curLevel - 1].back();
- assert(stride == 1 && "Not yet implemented");
- tupleCnt = MULI(tupleCnt, sz);
- }
- for (Value &cache : slicePosBuffer[tid][lvl])
- cache = allocSlicePosBuf(builder, loc, tupleCnt);
- }
-
if (sliceInfo.isInitialTensor() ||
(lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) {
// First level or previous level has been full resolved.
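A plain-C++ analogue of the slice-size computation performed in initSliceDriven above (hypothetical helper): the buffer needed to reduce an affine expression such as 2 * d0 + d1 is the sum of each dependent loop's upper bound scaled by its coefficient.

#include <cstdint>
#include <utility>
#include <vector>

// Each entry is {loopUpperBound, stride} for one dependent loop.
static int64_t sliceSize(const std::vector<std::pair<int64_t, int64_t>> &deps) {
  int64_t size = 0;
  for (auto [loopHi, stride] : deps)
    size += loopHi * stride;
  return size;
}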
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index eb577ee4acef..450678924c13 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -63,22 +63,18 @@ public:
using SynTensorBoundSetter =
function_ref<Value(OpBuilder &builder, Location loc, Level lvl)>;
- // Map from [tid, lvl] to a list of dependent [tidlvl, coeffecient] for
+ // Map from [tid, lvl] to a list of dependent [LoopId, coefficient] for
// subscript expressions on sparse tensors.
//
- // E.g., for affine index (2 * d0 + d1), it depends on two tidlvls that
- // defines d0 and d1 (for affine expression reduction) and uses 2 and 1 for
- // cofficients on d0, d1 respectively.
- // If the list is empty, it means that there is no affine expression on the
- // input [tid, lvl].
+ // E.g., for affine index (2 * d0 + d1), it depends on loops d0 and d1 (for
+ // affine expression reduction) and uses 2 and 1 for coefficients on d0, d1
+ // respectively. If the list is empty, it means that there is no affine
+ // expression on the input [tid, lvl].
//
- // NOTE: The caller is responsible to ensure that the order of the returned
- // list to be consistent with the topological order of the iteration graph,
- // otherwise the loop emitter might reduce a wrong dependent index variable
- // when generating slice-driven loops.
+ // NOTE: LoopEmitter assumes that the loop id is consistent with the loop
+ // order, i.e., loop `d0` will be generated before loop `d1`.
using DependentLvlGetter =
- function_ref<std::vector<std::pair<TensorLevel, unsigned>>(TensorId,
- Level)>;
+ function_ref<std::vector<std::pair<LoopId, unsigned>>(TensorId, Level)>;
LoopEmitter() = default;
@@ -534,6 +530,8 @@ private:
// Slice-driven loop related methods.
//
+ void initSliceDriven(OpBuilder &builder, Location loc);
+
/// Retrieves the most recent slice on lvl. To reduce affine expression like
/// d0 + d1 + d2, we need two slices (one of size d1 + d2, and the other of
/// size d2). This methods returns the latter slice (of size d2).
@@ -621,9 +619,6 @@ private:
bool hasOutput;
bool isSparseOut;
- /// The insertion point to allocate top level local variables.
- Operation *localInsertPos;
-
//
// Fields which have `numTensor` many entries.
//
@@ -645,7 +640,7 @@ private:
std::vector<std::vector<Value>> highs;
std::vector<std::vector<Value>> lvlSizes;
std::vector<std::vector<std::unique_ptr<SparseTensorLevel>>> lvls;
- std::vector<Value> valBuffer; // to_value
+ std::vector<Value> valBuffer; // to_value
//
// Slice-driven loops related fields.
@@ -659,7 +654,7 @@ private:
// Map from [tid, level] to a list of dependent [tidlevel, coefficient].
// See comments for `DependentLvlGetter`.
- std::vector<std::vector<std::vector<std::pair<TensorLevel, unsigned>>>>
+ std::vector<std::vector<std::vector<std::pair<LoopId, unsigned>>>>
dependentLvlMap;
// The cached position buffer for the slices, they serve the same purpose as
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 1b0cdbd0f4f7..816e6ba8fed9 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -3466,44 +3466,6 @@ OpFoldResult SplatOp::fold(FoldAdaptor adaptor) {
// PackOp/UnPackOp Common
//===----------------------------------------------------------------------===//
-namespace {
-
-/// Packing one-dimensional tensor can be expressed as an expand shape op.
-struct SimplifyPackToExandShape : public OpRewritePattern<PackOp> {
- using OpRewritePattern<PackOp>::OpRewritePattern;
-
- Value insertExpand(RewriterBase &rewriter, Location loc, Value operand,
- Type newOperandType, ArrayAttr reassociation) const {
- if (operand.getType() == newOperandType)
- return operand;
- return rewriter.create<tensor::ExpandShapeOp>(loc, newOperandType, operand,
- reassociation);
- }
-
- LogicalResult matchAndRewrite(PackOp packOp,
- PatternRewriter &rewriter) const override {
- RankedTensorType sourceType = packOp.getSourceType();
- RankedTensorType destType = packOp.getDestType();
- if (sourceType.getRank() != 1 || packOp.getPaddingValue())
- return failure();
- auto reassociation =
- getReassociationIndicesForReshape(sourceType, destType);
- if (!reassociation)
- return failure();
- Value expanded = insertExpand(
- rewriter, packOp.getLoc(), packOp.getSource(), destType,
- getReassociationIndicesAttribute(rewriter, *reassociation));
- rewriter.replaceOp(packOp, expanded);
- return success();
- }
-};
-
-} // namespace
-
-void mlir::tensor::populateSimplifyTensorPack(RewritePatternSet &patterns) {
- patterns.add<SimplifyPackToExandShape>(patterns.getContext());
-}
-
template <typename OpTy>
static LogicalResult
reifyResultShapesImpl(OpTy op, OpBuilder &builder,
diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
index d233ab7a0e89..cbc0d499d9d5 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
@@ -4,10 +4,10 @@ add_mlir_dialect_library(MLIRTensorTransforms
ConcatOpPatterns.cpp
EmptyOpPatterns.cpp
ExtractSliceFromReshapeUtils.cpp
- FoldIntoPackAndUnpackPatterns.cpp
FoldTensorSubsetOps.cpp
IndependenceTransforms.cpp
MergeConsecutiveInsertExtractSlicePatterns.cpp
+ PackAndUnpackPatterns.cpp
ReshapePatterns.cpp
RewriteAsConstant.cpp
SwapExtractSliceWithProducerPatterns.cpp
diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldIntoPackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
index e4509b331bee..e20450c95ffd 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/FoldIntoPackAndUnpackPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
@@ -21,6 +21,46 @@ static bool areAllConstantIntValue(ArrayRef<OpFoldResult> ofrs, int64_t value) {
ofrs, [&](OpFoldResult ofr) { return isConstantIntValue(ofr, value); });
}
+/// Packing a one-dimensional tensor can be expressed as an expand shape op.
+struct SimplifyPackToExpandShape : public OpRewritePattern<PackOp> {
+ using OpRewritePattern<PackOp>::OpRewritePattern;
+
+ Value insertExpand(RewriterBase &rewriter, Location loc, Value operand,
+ Type newOperandType, ArrayAttr reassociation) const {
+ if (operand.getType() == newOperandType)
+ return operand;
+ return rewriter.create<tensor::ExpandShapeOp>(loc, newOperandType, operand,
+ reassociation);
+ }
+
+ LogicalResult matchAndRewrite(PackOp packOp,
+ PatternRewriter &rewriter) const override {
+ if (packOp.getPaddingValue())
+ return rewriter.notifyMatchFailure(packOp, "expects no padding value");
+
+ if (!packOp.getOuterDimsPerm().empty())
+ return rewriter.notifyMatchFailure(packOp, "expects no outer_dims_perm");
+
+ RankedTensorType sourceType = packOp.getSourceType();
+ RankedTensorType destType = packOp.getDestType();
+ ArrayRef<int64_t> dimsPos = packOp.getInnerDimsPos();
+ if (dimsPos.size() != 1 || (dimsPos[0] + 1 != sourceType.getRank())) {
+ return rewriter.notifyMatchFailure(
+ packOp, "expects packing at the innermost dimension");
+ }
+
+ auto reassociation =
+ getReassociationIndicesForReshape(sourceType, destType);
+ if (!reassociation)
+ return failure();
+ Value expanded = insertExpand(
+ rewriter, packOp.getLoc(), packOp.getSource(), destType,
+ getReassociationIndicesAttribute(rewriter, *reassociation));
+ rewriter.replaceOp(packOp, expanded);
+ return success();
+ }
+};
+
/// Fold a `pad` -> `pack` into `pack` if they have the same padding values and
/// the pad op has zero low paddings, or if `pack` has no padding values.
struct FoldPadWithPackOp : public OpRewritePattern<PackOp> {
@@ -150,5 +190,9 @@ void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns) {
patterns.getContext());
}
+void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns) {
+ patterns.add<SimplifyPackToExpandShape>(patterns.getContext());
+}
+
} // namespace tensor
} // namespace mlir
diff --git a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
index 9d7e4000fe3b..24835c555b68 100644
--- a/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/AsyncRuntime.cpp
@@ -446,7 +446,7 @@ extern "C" int64_t mlirAsyncRuntimGetNumWorkerThreads() {
extern "C" void mlirAsyncRuntimePrintCurrentThreadId() {
static thread_local std::thread::id thisId = std::this_thread::get_id();
- std::cout << "Current thread id: " << thisId << std::endl;
+ std::cout << "Current thread id: " << thisId << '\n';
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
index 2dcc5d22e229..e7ac8f161875 100644
--- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
@@ -450,10 +450,10 @@ void _mlir_ciface_outSparseTensorWriterMetaData(
assert(dimRank != 0);
index_type *dimSizes = MEMREF_GET_PAYLOAD(dimSizesRef);
std::ostream &file = *static_cast<std::ostream *>(p);
- file << dimRank << " " << nse << std::endl;
+ file << dimRank << " " << nse << '\n';
for (index_type d = 0; d < dimRank - 1; d++)
file << dimSizes[d] << " ";
- file << dimSizes[dimRank - 1] << std::endl;
+ file << dimSizes[dimRank - 1] << '\n';
}
#define IMPL_OUTNEXT(VNAME, V) \
@@ -468,7 +468,7 @@ void _mlir_ciface_outSparseTensorWriterMetaData(
for (index_type d = 0; d < dimRank; d++) \
file << (dimCoords[d] + 1) << " "; \
V *value = MEMREF_GET_PAYLOAD(vref); \
- file << *value << std::endl; \
+ file << *value << '\n'; \
}
MLIR_SPARSETENSOR_FOREVERY_V(IMPL_OUTNEXT)
#undef IMPL_OUTNEXT
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 905405e93988..528f113ea6f7 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -1439,6 +1439,7 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) {
} else {
callOp = builder.create<CallOp>(loc, funcTy, operands);
}
+ callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv()));
setFastmathFlagsAttr(inst, callOp);
if (!callInst->getType()->isVoidTy())
mapValue(inst, callOp.getResult());
@@ -1516,6 +1517,7 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) {
loc, funcTy, /*callee=*/nullptr, operands, directNormalDest,
ValueRange(), lookupBlock(invokeInst->getUnwindDest()), unwindArgs);
}
+ invokeOp.setCConv(convertCConvFromLLVM(invokeInst->getCallingConv()));
if (!invokeInst->getType()->isVoidTy())
mapValue(inst, invokeOp.getResults().front());
else
diff --git a/mlir/lib/Transforms/SCCP.cpp b/mlir/lib/Transforms/SCCP.cpp
index 14435b37acc9..b2d3929b0459 100644
--- a/mlir/lib/Transforms/SCCP.cpp
+++ b/mlir/lib/Transforms/SCCP.cpp
@@ -53,7 +53,7 @@ static LogicalResult replaceWithConstant(DataFlowSolver &solver,
Dialect *dialect = latticeValue.getConstantDialect();
Value constant = folder.getOrCreateConstant(
builder.getInsertionBlock(), dialect, latticeValue.getConstantValue(),
- value.getType(), value.getLoc());
+ value.getType());
if (!constant)
return failure();
diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp
index eb4dcb251a22..e5f78abf7fca 100644
--- a/mlir/lib/Transforms/Utils/FoldUtils.cpp
+++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp
@@ -77,8 +77,10 @@ LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate) {
// Check to see if we should rehoist, i.e. if a non-constant operation was
// inserted before this one.
Block *opBlock = op->getBlock();
- if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode()))
+ if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode())) {
op->moveBefore(&opBlock->front());
+ op->setLoc(erasedFoldedLocation);
+ }
return failure();
}
@@ -112,8 +114,10 @@ bool OperationFolder::insertKnownConstant(Operation *op, Attribute constValue) {
// If this is a constant we unique'd, we don't need to insert, but we can
// check to see if we should rehoist it.
if (isFolderOwnedConstant(op)) {
- if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode()))
+ if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode())) {
op->moveBefore(&opBlock->front());
+ op->setLoc(erasedFoldedLocation);
+ }
return true;
}
@@ -142,6 +146,7 @@ bool OperationFolder::insertKnownConstant(Operation *op, Attribute constValue) {
if (folderConstOp) {
notifyRemoval(op);
rewriter.replaceOp(op, folderConstOp->getResults());
+ folderConstOp->setLoc(erasedFoldedLocation);
return false;
}
@@ -151,8 +156,10 @@ bool OperationFolder::insertKnownConstant(Operation *op, Attribute constValue) {
// anything. Otherwise, we move the constant to the insertion block.
Block *insertBlock = &insertRegion->front();
if (opBlock != insertBlock || (&insertBlock->front() != op &&
- !isFolderOwnedConstant(op->getPrevNode())))
+ !isFolderOwnedConstant(op->getPrevNode()))) {
op->moveBefore(&insertBlock->front());
+ op->setLoc(erasedFoldedLocation);
+ }
folderConstOp = op;
referencedDialects[op].push_back(op->getDialect());
@@ -193,17 +200,17 @@ void OperationFolder::clear() {
/// Get or create a constant using the given builder. On success this returns
/// the constant operation, nullptr otherwise.
Value OperationFolder::getOrCreateConstant(Block *block, Dialect *dialect,
- Attribute value, Type type,
- Location loc) {
+ Attribute value, Type type) {
// Find an insertion point for the constant.
auto *insertRegion = getInsertionRegion(interfaces, block);
auto &entry = insertRegion->front();
rewriter.setInsertionPoint(&entry, entry.begin());
// Get the constant map for the insertion region of this operation.
+  // Use the erased location since the op is being built at the front of the block.
auto &uniquedConstants = foldScopes[insertRegion];
- Operation *constOp =
- tryGetOrCreateConstant(uniquedConstants, dialect, value, type, loc);
+ Operation *constOp = tryGetOrCreateConstant(uniquedConstants, dialect, value,
+ type, erasedFoldedLocation);
return constOp ? constOp->getResult(0) : Value();
}
@@ -254,8 +261,9 @@ OperationFolder::processFoldResults(Operation *op,
// Check to see if there is a canonicalized version of this constant.
auto res = op->getResult(i);
Attribute attrRepl = foldResults[i].get<Attribute>();
- if (auto *constOp = tryGetOrCreateConstant(
- uniquedConstants, dialect, attrRepl, res.getType(), op->getLoc())) {
+ if (auto *constOp =
+ tryGetOrCreateConstant(uniquedConstants, dialect, attrRepl,
+ res.getType(), erasedFoldedLocation)) {
// Ensure that this constant dominates the operation we are replacing it
// with. This may not automatically happen if the operation being folded
// was inserted before the constant within the insertion block.
@@ -290,8 +298,11 @@ OperationFolder::tryGetOrCreateConstant(ConstantMap &uniquedConstants,
// Check if an existing mapping already exists.
auto constKey = std::make_tuple(dialect, value, type);
Operation *&constOp = uniquedConstants[constKey];
- if (constOp)
+ if (constOp) {
+ if (loc != constOp->getLoc())
+ constOp->setLoc(erasedFoldedLocation);
return constOp;
+ }
// If one doesn't exist, try to materialize one.
if (!(constOp = materializeConstant(dialect, rewriter, value, type, loc)))
@@ -314,6 +325,8 @@ OperationFolder::tryGetOrCreateConstant(ConstantMap &uniquedConstants,
notifyRemoval(constOp);
rewriter.eraseOp(constOp);
referencedDialects[existingOp].push_back(dialect);
+ if (loc != existingOp->getLoc())
+ existingOp->setLoc(erasedFoldedLocation);
return constOp = existingOp;
}
diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
index 7decbce018a8..eca13f52f53d 100644
--- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -15,6 +15,7 @@
#include "mlir/Config/mlir-config.h"
#include "mlir/IR/Action.h"
#include "mlir/IR/Matchers.h"
+#include "mlir/IR/Verifier.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Rewrite/PatternApplicator.h"
#include "mlir/Transforms/FoldUtils.h"
@@ -432,6 +433,10 @@ bool GreedyPatternRewriteDriver::processWorklist() {
if (succeeded(folder.tryToFold(op))) {
LLVM_DEBUG(logResultWithLine("success", "operation was folded"));
changed = true;
+#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+ if (config.scope && failed(verify(config.scope->getParentOp())))
+ llvm::report_fatal_error("IR failed to verify after folding");
+#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
continue;
}
@@ -464,8 +469,9 @@ bool GreedyPatternRewriteDriver::processWorklist() {
#endif
#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
- debugFingerPrints.computeFingerPrints(
- /*topLevel=*/config.scope ? config.scope->getParentOp() : op);
+ if (config.scope) {
+ debugFingerPrints.computeFingerPrints(config.scope->getParentOp());
+ }
auto clearFingerprints =
llvm::make_scope_exit([&]() { debugFingerPrints.clear(); });
#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
@@ -473,17 +479,24 @@ bool GreedyPatternRewriteDriver::processWorklist() {
LogicalResult matchResult =
matcher.matchAndRewrite(op, *this, canApply, onFailure, onSuccess);
+#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+ if (config.scope && failed(verify(config.scope->getParentOp())))
+ llvm::report_fatal_error("IR failed to verify after pattern application");
+#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+
if (succeeded(matchResult)) {
LLVM_DEBUG(logResultWithLine("success", "pattern matched"));
#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
- debugFingerPrints.notifyRewriteSuccess();
+ if (config.scope)
+ debugFingerPrints.notifyRewriteSuccess();
#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
changed = true;
++numRewrites;
} else {
LLVM_DEBUG(logResultWithLine("failure", "pattern failed to match"));
#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
- debugFingerPrints.notifyRewriteFailure();
+ if (config.scope)
+ debugFingerPrints.notifyRewriteFailure();
#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
}
}
@@ -562,6 +575,18 @@ void GreedyPatternRewriteDriver::notifyOperationRemoved(Operation *op) {
logger.startLine() << "** Erase : '" << op->getName() << "'(" << op
<< ")\n";
});
+
+#ifndef NDEBUG
+ // Only ops that are within the configured scope are added to the worklist of
+ // the greedy pattern rewriter. Moreover, the parent op of the scope region is
+ // the part of the IR that is taken into account for the "expensive checks".
+ // A greedy pattern rewrite is not allowed to erase the parent op of the scope
+ // region, as that would break the worklist handling and the expensive checks.
+ if (config.scope && config.scope->getParentOp() == op)
+ llvm_unreachable(
+ "scope region must not be erased during greedy pattern rewrite");
+#endif // NDEBUG
+
if (config.listener)
config.listener->notifyOperationRemoved(op);
@@ -721,6 +746,12 @@ mlir::applyPatternsAndFoldGreedily(Region &region,
if (!config.scope)
config.scope = &region;
+#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+ if (failed(verify(config.scope->getParentOp())))
+ llvm::report_fatal_error(
+ "greedy pattern rewriter input IR failed to verify");
+#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+
// Start the pattern driver.
RegionPatternRewriteDriver driver(region.getContext(), patterns, config,
region);
@@ -846,6 +877,12 @@ LogicalResult mlir::applyOpPatternsAndFold(
#endif // NDEBUG
}
+#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+ if (config.scope && failed(verify(config.scope->getParentOp())))
+ llvm::report_fatal_error(
+ "greedy pattern rewriter input IR failed to verify");
+#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
+
// Start the pattern driver.
llvm::SmallDenseSet<Operation *, 4> surviving;
MultiOpPatternRewriteDriver driver(ops.front()->getContext(), patterns,
diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt
index 41d91cf67783..266b86090fe1 100644
--- a/mlir/python/CMakeLists.txt
+++ b/mlir/python/CMakeLists.txt
@@ -21,7 +21,6 @@ declare_mlir_python_sources(MLIRPythonSources.Core.Python
_mlir_libs/__init__.py
ir.py
passmanager.py
- extras/types.py
dialects/_ods_common.py
# The main _mlir module has submodules: include stubs from each.
@@ -30,6 +29,14 @@ declare_mlir_python_sources(MLIRPythonSources.Core.Python
_mlir_libs/_mlir/passmanager.pyi
)
+declare_mlir_python_sources(MLIRPythonSources.Core.Python.Extras
+ ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
+ ADD_TO_PARENT MLIRPythonSources.Core.Python
+ SOURCES
+ extras/types.py
+ extras/meta.py
+)
+
declare_mlir_python_sources(MLIRPythonSources.ExecutionEngine
ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
ADD_TO_PARENT MLIRPythonSources
@@ -172,7 +179,7 @@ declare_mlir_python_sources(
ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
GEN_ENUM_BINDINGS
SOURCES
- extras/dialects/transform/__init__.py)
+ dialects/transform/extras/__init__.py)
declare_mlir_dialect_extension_python_bindings(
ADD_TO_PARENT MLIRPythonSources.Dialects
@@ -372,6 +379,13 @@ declare_mlir_dialect_python_bindings(
)
declare_mlir_dialect_python_bindings(
+ ADD_TO_PARENT MLIRPythonSources.Dialects
+ ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
+ TD_FILE dialects/SPIRVOps.td
+ SOURCES dialects/spirv.py
+ DIALECT_NAME spirv)
+
+declare_mlir_dialect_python_bindings(
ADD_TO_PARENT MLIRPythonSources.Dialects
ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
TD_FILE dialects/TensorOps.td
diff --git a/mlir/python/mlir/dialects/SPIRVOps.td b/mlir/python/mlir/dialects/SPIRVOps.td
new file mode 100644
index 000000000000..eaae0e609d62
--- /dev/null
+++ b/mlir/python/mlir/dialects/SPIRVOps.td
@@ -0,0 +1,14 @@
+//===-- SPIRVOps.td - Entry point for SPIRVOps bind --------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PYTHON_BINDINGS_SPIRV_OPS
+#define PYTHON_BINDINGS_SPIRV_OPS
+
+include "mlir/Dialect/SPIRV/IR/SPIRVOps.td"
+
+#endif
diff --git a/mlir/python/mlir/dialects/arith.py b/mlir/python/mlir/dialects/arith.py
index 83aca0d58bf2..663a53660a64 100644
--- a/mlir/python/mlir/dialects/arith.py
+++ b/mlir/python/mlir/dialects/arith.py
@@ -11,6 +11,8 @@ try:
from ._ods_common import (
get_default_loc_context as _get_default_loc_context,
_cext as _ods_cext,
+ get_op_result_or_op_results as _get_op_result_or_op_results,
+ SubClassValueT as _SubClassValueT,
)
from typing import Any, List, Union
@@ -75,3 +77,9 @@ class ConstantOp(ConstantOp):
return FloatAttr(self.value).value
else:
raise ValueError("only integer and float constants have literal values")
+
+
+def constant(
+ result: Type, value: Union[int, float, Attribute], *, loc=None, ip=None
+) -> _SubClassValueT:
+ return _get_op_result_or_op_results(ConstantOp(result, value, loc=loc, ip=ip))
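
A minimal usage sketch for the `constant` wrapper added above; everything below (the module, the chosen types, the printed output) is illustrative and relies only on the existing `mlir.ir` and `mlir.dialects.arith` bindings, not on anything else in this patch.

from mlir.ir import Context, InsertionPoint, Location, Module, IndexType, F32Type
from mlir.dialects import arith

with Context(), Location.unknown():
    m = Module.create()
    with InsertionPoint(m.body):
        # Plain Python ints/floats are converted to typed attributes by ConstantOp.
        c0 = arith.constant(IndexType.get(), 0)   # index constant
        c1 = arith.constant(F32Type.get(), 1.0)   # f32 constant
    print(m)
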
diff --git a/mlir/python/mlir/dialects/builtin.py b/mlir/python/mlir/dialects/builtin.py
index b71cc2466d46..1c69d6d7c3a0 100644
--- a/mlir/python/mlir/dialects/builtin.py
+++ b/mlir/python/mlir/dialects/builtin.py
@@ -2,8 +2,11 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from typing import Dict, Optional
+
from ._builtin_ops_gen import *
from ._builtin_ops_gen import _Dialect
+from ..extras.meta import region_op
try:
from ..ir import *
@@ -23,3 +26,23 @@ class ModuleOp(ModuleOp):
@property
def body(self):
return self.regions[0].blocks[0]
+
+
+@region_op
+def module(
+ *,
+ sym_name=None,
+ sym_visibility=None,
+ attrs: Optional[Dict[str, Attribute]] = None,
+ loc=None,
+ ip=None,
+):
+ mod = ModuleOp.__base__(
+ sym_name=sym_name, sym_visibility=sym_visibility, loc=loc, ip=ip
+ )
+ if attrs is None:
+ attrs = {}
+ for attr_name, attr in attrs.items():
+ mod.operation.attributes[attr_name] = attr
+
+ return mod
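
A small sketch of how the `module` region builder added above might be used; the nested constant is only a placeholder, and the surrounding context/module setup comes from the existing `mlir.ir` API.

from mlir import ir
from mlir.dialects import arith, builtin

with ir.Context(), ir.Location.unknown():
    top = ir.Module.create()
    with ir.InsertionPoint(top.body):
        # The decorated function's body is emitted into the nested module's
        # region; builtin.module has no results, so `payload` is bound to the op.
        @builtin.module(sym_name="payload")
        def payload():
            arith.constant(ir.IndexType.get(), 0)
    print(top)
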
diff --git a/mlir/python/mlir/dialects/func.py b/mlir/python/mlir/dialects/func.py
index 6599f67b7078..24fdcbcd85b2 100644
--- a/mlir/python/mlir/dialects/func.py
+++ b/mlir/python/mlir/dialects/func.py
@@ -243,6 +243,9 @@ class FuncOp(FuncOp):
return decorator
+func = FuncOp.from_py_func
+
+
@_ods_cext.register_operation(_Dialect, replace=True)
class CallOp(CallOp):
"""Specialization for the call op class."""
diff --git a/mlir/python/mlir/dialects/pdl.py b/mlir/python/mlir/dialects/pdl.py
index 90d7d706238e..db07dc50aabd 100644
--- a/mlir/python/mlir/dialects/pdl.py
+++ b/mlir/python/mlir/dialects/pdl.py
@@ -5,6 +5,7 @@
from ._pdl_ops_gen import *
from ._pdl_ops_gen import _Dialect
from .._mlir_libs._mlirDialectsPDL import *
+from .._mlir_libs._mlirDialectsPDL import OperationType
try:
@@ -13,7 +14,7 @@ try:
except ImportError as e:
raise RuntimeError("Error loading imports from extension module") from e
-from typing import Union, Optional, Sequence, Mapping
+from typing import Union, Optional, Sequence, Mapping, NewType
from ._ods_common import (
get_op_result_or_value as _get_value,
get_op_results_or_values as _get_values,
@@ -220,3 +221,10 @@ class TypesOp(TypesOp):
constantTypes = []
result = pdl.RangeType.get(pdl.TypeType.get())
super().__init__(result, constantTypes=constantTypes, loc=loc, ip=ip)
+
+
+OperationTypeT = NewType("OperationType", OperationType)
+
+
+def op_t() -> OperationTypeT:
+ return OperationTypeT(OperationType.get())
diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py
index 20bbed9bc93d..dad7377987e5 100644
--- a/mlir/python/mlir/dialects/scf.py
+++ b/mlir/python/mlir/dialects/scf.py
@@ -120,7 +120,7 @@ def for_(
params = [start, stop, step]
for i, p in enumerate(params):
if isinstance(p, int):
- p = constant(IntegerAttr.get(IndexType.get(), p))
+ p = constant(IndexType.get(), p)
elif isinstance(p, float):
raise ValueError(f"{p=} must be int.")
params[i] = p
diff --git a/mlir/python/mlir/dialects/spirv.py b/mlir/python/mlir/dialects/spirv.py
new file mode 100644
index 000000000000..269678a2032e
--- /dev/null
+++ b/mlir/python/mlir/dialects/spirv.py
@@ -0,0 +1,5 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from ._spirv_ops_gen import *
diff --git a/mlir/python/mlir/dialects/tensor.py b/mlir/python/mlir/dialects/tensor.py
index 67248748eaf3..79dd9476ad0f 100644
--- a/mlir/python/mlir/dialects/tensor.py
+++ b/mlir/python/mlir/dialects/tensor.py
@@ -4,6 +4,7 @@
from ._tensor_ops_gen import *
from ._tensor_ops_gen import _Dialect
+from ..extras.meta import region_op
try:
from ..ir import *
@@ -40,3 +41,9 @@ class EmptyOp(EmptyOp):
dynamic_sizes.append(s)
result_type = RankedTensorType.get(static_sizes, element_type)
super().__init__(result_type, dynamic_sizes, loc=loc, ip=ip)
+
+
+generate = region_op(
+ lambda result, dynamic_extents: GenerateOp(result, dynamic_extents),
+ terminator=lambda args: YieldOp(args[0]),
+)
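
A usage sketch for the `generate` wrapper added above. The type annotation on the wrapped function's argument supplies the entry-block argument type (tensor.generate takes one index argument per result dimension), and the returned value is handed to the `tensor.yield` terminator. The shape and element type are arbitrary.

from mlir import ir
from mlir.dialects import arith, tensor

with ir.Context(), ir.Location.unknown():
    m = ir.Module.create()
    with ir.InsertionPoint(m.body):
        f32 = ir.F32Type.get()
        # tensor.generate with a static 4xf32 result and no dynamic extents.
        @tensor.generate(ir.RankedTensorType.get([4], f32), [])
        def filled(i: ir.IndexType.get()):
            return arith.constant(f32, 1.0)
    print(m)
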
diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py
index 7ae4fefbac41..5b158ec6b65f 100644
--- a/mlir/python/mlir/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/__init__.py
@@ -6,6 +6,7 @@ from .._transform_enum_gen import *
from .._transform_ops_gen import *
from .._transform_ops_gen import _Dialect
from ..._mlir_libs._mlirDialectsTransform import *
+from ..._mlir_libs._mlirDialectsTransform import AnyOpType, OperationType
try:
from ...ir import *
@@ -17,7 +18,7 @@ try:
except ImportError as e:
raise RuntimeError("Error loading imports from extension module") from e
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence, Union, NewType
@_ods_cext.register_operation(_Dialect, replace=True)
@@ -174,7 +175,7 @@ class NamedSequenceOp(NamedSequenceOp):
result_types: Sequence[Type],
sym_visibility=None,
arg_attrs=None,
- res_attrs=None
+ res_attrs=None,
):
function_type = FunctionType.get(input_types, result_types)
super().__init__(
@@ -182,7 +183,7 @@ class NamedSequenceOp(NamedSequenceOp):
function_type=TypeAttr.get(function_type),
sym_visibility=sym_visibility,
arg_attrs=arg_attrs,
- res_attrs=res_attrs
+ res_attrs=res_attrs,
)
self.regions[0].blocks.append(*input_types)
@@ -211,3 +212,10 @@ class YieldOp(YieldOp):
if operands is None:
operands = []
super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip)
+
+
+AnyOpTypeT = NewType("AnyOpType", AnyOpType)
+
+
+def any_op_t() -> AnyOpTypeT:
+ return AnyOpTypeT(AnyOpType.get())
diff --git a/mlir/python/mlir/extras/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/extras/__init__.py
index 9e313324318a..e4d47e9064f2 100644
--- a/mlir/python/mlir/extras/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/extras/__init__.py
@@ -2,12 +2,19 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from __future__ import annotations
-from typing import Callable, Optional, Sequence
+from typing import Callable, Optional, Sequence, Union
+from ....extras.meta import region_op
from .... import ir
-from ....dialects import transform
-from ....dialects.transform import structured
+from .. import (
+ AnyOpType,
+ OperationType,
+ NamedSequenceOp,
+ YieldOp,
+ SequenceOp,
+ ApplyPatternsOp,
+)
+from .. import structured
class Handle(ir.Value):
@@ -25,16 +32,16 @@ class Handle(ir.Value):
self,
v: ir.Value,
*,
- parent: Optional[Handle] = None,
- children: Optional[Sequence[Handle]] = None,
+ parent: Optional["Handle"] = None,
+ children: Optional[Sequence["Handle"]] = None,
):
super().__init__(v)
self.parent = parent
self.children = children if children is not None else []
-@ir.register_value_caster(transform.AnyOpType.get_static_typeid())
-@ir.register_value_caster(transform.OperationType.get_static_typeid())
+@ir.register_value_caster(AnyOpType.get_static_typeid())
+@ir.register_value_caster(OperationType.get_static_typeid())
class OpHandle(Handle):
"""
Wrapper around a transform operation handle with methods to chain further
@@ -52,11 +59,13 @@ class OpHandle(Handle):
def match_ops(
self,
- ops: str
- | ir.OpView
- | structured.MatchInterfaceEnum
- | Sequence[str | ir.OpView],
- ) -> OpHandle:
+ ops: Union[
+ str,
+ ir.OpView,
+ structured.MatchInterfaceEnum,
+ Sequence[Union[str, ir.OpView]],
+ ],
+ ) -> "OpHandle":
"""
Emits a `transform.structured.MatchOp`.
Returns a handle to payload ops that match the given names, types, or
@@ -70,7 +79,7 @@ class OpHandle(Handle):
if isinstance(ops, str):
ops = structured.MatchInterfaceEnum[ops]
match_op = structured.MatchOp(
- transform.AnyOpType.get(),
+ AnyOpType.get(),
self,
interface=ops,
)
@@ -78,15 +87,15 @@ class OpHandle(Handle):
# Handle op name(s), either given directly as string or given as op.
else:
if isinstance(ops, str):
- op_type = transform.OperationType.get(ops)
+ op_type = OperationType.get(ops)
op_names = [ops]
elif isinstance(ops, Sequence):
- op_type = transform.AnyOpType.get()
+ op_type = AnyOpType.get()
op_names = [
op if isinstance(op, str) else op.OPERATION_NAME for op in ops
]
else:
- op_type = transform.OperationType.get(ops.OPERATION_NAME)
+ op_type = OperationType.get(ops.OPERATION_NAME)
op_names = [ops.OPERATION_NAME]
match_op = structured.MatchOp.match_op_names(
op_type,
@@ -100,7 +109,7 @@ class OpHandle(Handle):
def insert_transform_script(
- block_or_insertion_point: ir.Block | ir.InsertionPoint,
+ block_or_insertion_point: Union[ir.Block, ir.InsertionPoint],
script: Callable[[OpHandle], None],
dump_script: bool = False,
) -> None:
@@ -137,12 +146,17 @@ def insert_transform_script(
with context, ir.Location.unknown(context):
with insertion_point:
- named_sequence_op = transform.NamedSequenceOp(
- "__transform_main", [transform.AnyOpType.get()], []
+ named_sequence_op = NamedSequenceOp(
+ "__transform_main", [AnyOpType.get()], []
)
with ir.InsertionPoint(named_sequence_op.body):
script(named_sequence_op.bodyTarget)
- transform.YieldOp([])
+ YieldOp([])
if dump_script:
print(named_sequence_op)
+
+
+sequence = region_op(SequenceOp.__base__, terminator=YieldOp)
+named_sequence = region_op(NamedSequenceOp, terminator=YieldOp)
+apply_patterns = region_op(ApplyPatternsOp)
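
A sketch of driving the relocated extras end to end, loosely following the in-tree tests; the `transform.with_named_sequence` attribute is only needed so that the resulting module verifies as a transform script.

from mlir import ir
from mlir.dialects.transform.extras import OpHandle, insert_transform_script

def script(root: OpHandle):
    # Match all func.func payload ops under the root handle.
    root.match_ops("func.func")

with ir.Context(), ir.Location.unknown():
    m = ir.Module.create()
    m.operation.attributes["transform.with_named_sequence"] = ir.UnitAttr.get()
    insert_transform_script(m.body, script, dump_script=True)
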
diff --git a/mlir/python/mlir/extras/meta.py b/mlir/python/mlir/extras/meta.py
new file mode 100644
index 000000000000..3f2defadf794
--- /dev/null
+++ b/mlir/python/mlir/extras/meta.py
@@ -0,0 +1,83 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import inspect
+from functools import wraps
+
+from ..dialects._ods_common import get_op_result_or_op_results
+from ..ir import Type, InsertionPoint
+
+
+def op_region_builder(op, op_region, terminator=None):
+ def builder_wrapper(body_builder):
+ # Add a block with block args having types determined by type hints on the wrapped function.
+ if len(op_region.blocks) == 0:
+ sig = inspect.signature(body_builder)
+ types = [p.annotation for p in sig.parameters.values()]
+ if not (
+ len(types) == len(sig.parameters)
+ and all(isinstance(t, Type) for t in types)
+ ):
+ raise ValueError(
+ f"for {body_builder=} either missing a type annotation or type annotation isn't a mlir type: {sig}"
+ )
+
+ op_region.blocks.append(*types)
+
+ with InsertionPoint(op_region.blocks[0]):
+ results = body_builder(*list(op_region.blocks[0].arguments))
+
+ with InsertionPoint(list(op_region.blocks)[-1]):
+ if terminator is not None:
+ res = []
+ if isinstance(results, (tuple, list)):
+ res.extend(results)
+ elif results is not None:
+ res.append(results)
+ terminator(res)
+
+ return get_op_result_or_op_results(op)
+
+ return builder_wrapper
+
+
+def region_op(op_constructor, terminator=None):
+ """Decorator to define an MLIR Op specified as a python function.
+
+ Requires that an `mlir.ir.InsertionPoint` and `mlir.ir.Location` are
+ active for the current thread (i.e. established in a `with` block).
+
+ Supports "naked" usage i.e., no parens if no args need to be passed to the Op constructor.
+
+ When applied as a decorator to a Python function, an entry block will
+ be constructed for the Op with types as specified **as type hints on the args of the function**.
+ The block arguments will be passed positionally to the Python function.
+
+ If a terminator is specified then the return from the decorated function will be passed
+ to the terminator as the last statement in the entry block. Note, the API for the terminator
+ is a (possibly empty) list; terminator accepting single values should be wrapped in a
+ `lambda args: term(args[0])`
+
+ The identifier (name) of the function will become:
+ 1. A single value result if the Op returns a single value;
+ 2. An OpResultList (as a list) if the Op returns multiple values;
+ 3. The Operation if the Op returns no results.
+
+ See examples in tensor.py and transform.extras.
+ """
+
+ def op_decorator(*args, **kwargs):
+ op = op_constructor(*args, **kwargs)
+ op_region = op.regions[0]
+
+ return op_region_builder(op, op_region, terminator)
+
+ @wraps(op_decorator)
+ def maybe_no_args(*args, **kwargs):
+ if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
+ return op_decorator()(args[0])
+ else:
+ return op_decorator(*args, **kwargs)
+
+ return maybe_no_args
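
The "naked" decorator form described in the docstring above, sketched with the `builtin.module` wrapper from earlier in this patch; no arguments are passed to the op constructor, and `nested` ends up bound to the created op because builtin.module has no results.

from mlir import ir
from mlir.dialects import builtin

with ir.Context(), ir.Location.unknown():
    top = ir.Module.create()
    with ir.InsertionPoint(top.body):
        @builtin.module            # no parentheses: constructor defaults are used
        def nested():
            pass
    print(top)
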
diff --git a/mlir/test/Conversion/GPUToSPIRV/reductions.mlir b/mlir/test/Conversion/GPUToSPIRV/reductions.mlir
index af58f4173136..44f85f68587f 100644
--- a/mlir/test/Conversion/GPUToSPIRV/reductions.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/reductions.mlir
@@ -655,6 +655,26 @@ gpu.module @kernels {
// -----
+module attributes {
+ gpu.container_module,
+ spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Kernel, Addresses, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], []>, #spirv.resource_limits<>>
+} {
+
+gpu.module @kernels {
+ // CHECK-LABEL: spirv.func @test
+ // CHECK-SAME: (%[[ARG:.*]]: i32)
+ gpu.func @test(%arg : vector<1xi32>) kernel
+ attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
+ // CHECK: %{{.*}} = spirv.GroupNonUniformSMax "Subgroup" "Reduce" %[[ARG]] : i32
+ %r0 = gpu.subgroup_reduce maxsi %arg : (vector<1xi32>) -> (vector<1xi32>)
+ gpu.return
+ }
+}
+
+}
+
+// -----
+
// TODO: Handle boolean reductions.
module attributes {
@@ -751,3 +771,21 @@ gpu.module @kernels {
}
}
}
+
+// -----
+
+// Vector reductions need to be lowered to scalar reductions first.
+
+module attributes {
+ gpu.container_module,
+ spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Kernel, Addresses, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], []>, #spirv.resource_limits<>>
+} {
+gpu.module @kernels {
+ gpu.func @maxui(%arg : vector<2xi32>) kernel
+ attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} {
+ // expected-error @+1 {{failed to legalize operation 'gpu.subgroup_reduce'}}
+ %r0 = gpu.subgroup_reduce maxui %arg : (vector<2xi32>) -> (vector<2xi32>)
+ gpu.return
+ }
+}
+}
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 43de50f3dc8d..74186138c3a9 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -297,7 +297,7 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{
// CHECK: %[[A2:.*]] = llvm.mlir.constant(-1 : i32) : i32
// CHECK: %[[A3:.*]] = llvm.mlir.constant(-1 : i32) : i32
// CHECK: %[[A4:.*]] = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[A5:.*]] = llvm.mlir.constant(1 : i32) : i32
+ // CHECK: %[[A5:.*]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
// CHECK: %[[V4:.*]] = llvm.extractvalue %[[RES]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
// CHECK: %[[V11:.*]] = llvm.extractvalue %[[RES]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index aa010e759a0f..6616ea7cf699 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -88,7 +88,8 @@ func.func @matmul_dyn_output(%arg0: tensor<1x1x8xf32>, %arg1: tensor<1x8x1xf32>)
// CHECK-LABEL: @fully_connected
func.func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) {
// CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
- // CHECK: %[[TRANSPOSED:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xf32>, tensor<2xi64>) -> tensor<3x6xf32>
+ // CHECK: %[[TRANSPOSEDINIT:.+]] = tensor.empty() : tensor<3x6xf32>
+ // CHECK: %[[TRANSPOSED:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xf32>) outs(%[[TRANSPOSEDINIT]] : tensor<3x6xf32>) permutation = [1, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xf32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor<5x6xf32>) {
@@ -110,7 +111,7 @@ func.func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2
// CHECK-LABEL: @quantized_fully_connected
func.func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) {
// CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
- // CHECK: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xi8>, tensor<2xi64>) -> tensor<3x6xi8>
+ // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x6xi8>) permutation = [1, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xi32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xi32>) outs(%[[INIT]] : tensor<5x6xi32>) {
@@ -136,7 +137,7 @@ func.func @fully_connected_dyn(%arg0: tensor<?x3xf32>, %arg1: tensor<6x3xf32>, %
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK: %[[DIM0:.+]] = tensor.dim %arg0, %c0 : tensor<?x3xf32>
// CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
- // CHECK: %[[TRANSPOSED:.+]] = tosa.transpose %arg1, %[[PERM]] : (tensor<6x3xf32>, tensor<2xi64>) -> tensor<3x6xf32>
+ // CHECK: %[[TRANSPOSED:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x6xf32>) permutation = [1, 0]
// CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM0]]) : tensor<?x6xf32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor<?x6xf32>) {
@@ -377,7 +378,7 @@ func.func @avg_pool_dyn(%arg0: tensor<?x6x34x62xf32>) -> (tensor<?x5x33x62xf32>)
// CHECK-LABEL: @conv2d_i8
func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi8>, %bias: tensor<28xi8>) -> () {
// HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64>
- // HWCF: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[TRANSPOSE_DIMS]] : (tensor<28x1x1x27xi8>, tensor<4xi64>) -> tensor<1x1x27x28xi8>
+ // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x1x1x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<1x1x27x28xi8>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xi32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xi8>) outs(%[[INIT]] : tensor<1x45x40x28xi32>) {
// CHECK: arith.extsi
@@ -398,7 +399,7 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi
// CHECK-LABEL: @conv2d_f32
func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () {
// HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64>
- // HWCF: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[TRANSPOSE_DIMS]] : (tensor<28x3x3x27xf32>, tensor<4xi64>) -> tensor<3x3x27x28xf32>
+ // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x3x27x28xf32>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>) {
@@ -677,7 +678,7 @@ func.func @depthwise_conv2d_dyn_w_h(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<3x
// CHECK-LABEL: @conv3d_f32
func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4x5x27xf32>, %bias: tensor<28xf32>) -> () {
// CHECK-DAG: %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]>
- // CHECK-DAG: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERMS]]
+ // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xf32>) permutation = [1, 2, 3, 4, 0]
// CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic
// CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
@@ -701,7 +702,7 @@ func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4
// CHECK-LABEL: @conv3d_i8
func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5x27xi8>, %bias: tensor<28xi32>) -> () {
// CHECK-DAG: %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]>
- // CHECK-DAG: %[[TRANSPOSE:.+]] = tosa.transpose %arg1, %[[PERMS]]
+ // CHECK-DAG: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xi8>) permutation = [1, 2, 3, 4, 0]
// CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xi32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic
// CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
@@ -720,3 +721,63 @@ func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5
%0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>) -> tensor<1x47x45x43x28xi32>
return
}
+
+// -----
+
+// CHECK-LABEL: @test_transpose
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x2x3xi32>)
+func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () {
+ %0 = arith.constant dense<[1, 2, 0]> : tensor<3xi32>
+ // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<2x3x1xi32>
+ // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x2x3xi32>) outs(%[[INIT]] : tensor<2x3x1xi32>) permutation = [1, 2, 0]
+ %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<2x3x1xi32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @test_transpose_dyn
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x3x4xi32>)
+func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () {
+ %0 = arith.constant dense<[1, 3, 0, 2]> : tensor<4xi32>
+ // CHECK: %[[C1:.+]] = arith.constant 1
+ // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+ // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) : tensor<?x4x1x3xi32>
+ // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x?x3x4xi32>) outs(%[[INIT]] : tensor<?x4x1x3xi32>) permutation = [1, 3, 0, 2]
+ %1 = tosa.transpose %arg0, %0 : (tensor<1x?x3x4xi32>, tensor<4xi32>) -> tensor<?x4x1x3xi32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @test_transpose_dyn_multiple_2d
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>)
+func.func @test_transpose_dyn_multiple_2d(%arg0: tensor<?x?xf32>) -> () {
+ %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
+ // CHECK-DAG: %[[C0:.+]] = arith.constant 0
+ // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+ // CHECK-DAG: %[[C1:.+]] = arith.constant 1
+ // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+ // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]])
+ // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<?x?xf32>) outs(%[[INIT]] : tensor<?x?xf32>) permutation = [1, 0]
+ %1 = tosa.transpose %arg0, %0 : (tensor<?x?xf32>, tensor<2xi32>) -> tensor<?x?xf32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @test_transpose_dyn_multiple_3d
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?x?xf32>)
+func.func @test_transpose_dyn_multiple_3d(%arg0: tensor<?x?x?xf32>) {
+ %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32>
+ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
+ // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
+ // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+ // CHECK-DAG: %[[DIM2:.*]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
+ // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor<?x?x?xf32>
+ // CHECK: %[[TRANSPOSE:.*]] = linalg.transpose ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[INIT]] : tensor<?x?x?xf32>) permutation = [2, 0, 1]
+ %1 = "tosa.transpose"(%arg0, %0) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
+ return
+}
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
index e009cab4b1bf..c2bbfd5130eb 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
@@ -38,13 +38,3 @@ func.func @avg_pool2d_with_unsupported_quant_type(%arg0: tensor<1x7x7x9x!quant.u
%0 = "tosa.avg_pool2d"(%arg0) {acc_type = i32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} : (tensor<1x7x7x9x!quant.uniform<i8:f32, 0.01>>) -> tensor<1x7x7x9x!quant.uniform<i8:f32, 0.01>>
return %0 : tensor<1x7x7x9x!quant.uniform<i8:f32, 0.01>>
}
-
-// -----
-
-// check that --tosa-validate=strict-op-spec-alignment does not kick in because tosa-to-linalg-named comes before tosa-validate
-// this would have failed tosa strict-op-spec-alignment because perms of transpose is not constant
-// but tosa.transpose is lowered by tosa-to-linalg-named pass which is earlier than tosa-validate pass in the pipeline
-func.func @test_transpose_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> {
- %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32>
- return %0 : tensor<3x13x21xf32>
-}
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index e0e041139fe4..535316e5a372 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -820,7 +820,6 @@ func.func @test_negate_quantized(%arg0: tensor<1xi8>) -> () {
return
}
-
// -----
// CHECK-LABEL: @test_identity
@@ -836,90 +835,6 @@ func.func @test_identity(%arg0: tensor<1xf32>, %arg1: tensor<1xi32>) -> (tensor<
// -----
-// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
-// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-
-// CHECK-LABEL: @test_transpose
-// CHECK-SAME: ([[ARG0:%.+]]: tensor<1x2x3xi32>)
-func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () {
- %0 = arith.constant dense<[1, 2, 0]> : tensor<3xi32>
- // CHECK: [[INIT:%.+]] = tensor.empty() : tensor<2x3x1xi32>
- // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins([[ARG0]] : tensor<1x2x3xi32>) outs([[OUT:%.+]] : tensor<2x3x1xi32>)
- // CHECK: ^bb0([[ARG1:%.+]]: i32, [[ARG2:%.+]]: i32)
- // CHECK: linalg.yield [[ARG1]]
- // CHECK: }
- %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>) -> tensor<2x3x1xi32>
- return
-}
-
-// -----
-
-// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>
-// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-
-// CHECK-LABEL: @test_transpose_dyn
-// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x3x4xi32>)
-func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () {
- %0 = arith.constant dense<[1, 3, 0, 2]> : tensor<4xi32>
- // CHECK: %[[C1:.+]] = arith.constant 1
- // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C1]]
- // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) : tensor<?x4x1x3xi32>
- // CHECK: %[[GENERIC:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[ARG0]] : tensor<1x?x3x4xi32>) outs([[OUT:%.+]] : tensor<?x4x1x3xi32>)
- // CHECK: ^bb0([[ARG1:%.+]]: i32, [[ARG2:%.+]]: i32)
- // CHECK: linalg.yield [[ARG1]]
- // CHECK: }
- %1 = tosa.transpose %arg0, %0 : (tensor<1x?x3x4xi32>, tensor<4xi32>) -> tensor<?x4x1x3xi32>
- return
-}
-
-// -----
-
-// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0)>
-// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-
-// CHECK-LABEL: @test_transpose_dyn_multiple_2d
-// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>)
-func.func @test_transpose_dyn_multiple_2d(%arg0: tensor<?x?xf32>) -> () {
- %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
- // CHECK-DAG: %[[C0:.+]] = arith.constant 0
- // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
- // CHECK-DAG: %[[C1:.+]] = arith.constant 1
- // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
- // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]])
- // CHECK: %[[GENERIC:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0]] : tensor<?x?xf32>) outs([[OUT:%.+]] : tensor<?x?xf32>)
- // CHECK: ^bb0([[ARG1:%.+]]: f32, [[ARG2:%.+]]: f32)
- // CHECK: linalg.yield [[ARG1]]
- // CHECK: }
- %1 = tosa.transpose %arg0, %0 : (tensor<?x?xf32>, tensor<2xi32>) -> tensor<?x?xf32>
- return
-}
-
-// -----
-
-// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
-// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-
-// CHECK-LABEL: @test_transpose_dyn_multiple_3d
-// CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?x?xf32>)
-func.func @test_transpose_dyn_multiple_3d(%arg0: tensor<?x?x?xf32>) {
- %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32>
- // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
- // CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
- // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
- // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
- // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
- // CHECK-DAG: %[[DIM2:.*]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
- // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor<?x?x?xf32>
- // CHECK: %[[GENERIC:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[INIT]] : tensor<?x?x?xf32>) {
- // CHECK: ^bb0(%[[IN0:.*]]: f32, %[[OUT0:.*]]: f32):
- // CHECK: linalg.yield %[[IN0]] : f32
- // CHECK: } -> tensor<?x?x?xf32>
- %1 = "tosa.transpose"(%arg0, %0) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
- return
-}
-
-// -----
-
// CHECK-LABEL: @reduce_float
// CHECK-SAME: [[ARG0:%.+]]: tensor<5x4xf32>
func.func @reduce_float(%arg0: tensor<5x4xf32>) -> () {
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index d80392ebd87b..7ea0197bdecb 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2059,6 +2059,36 @@ func.func @vector_store_op_index(%memref : memref<200x100xindex>, %i : index, %j
// -----
+func.func @vector_load_op_0d(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<f32> {
+ %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ return %0 : vector<f32>
+}
+
+// CHECK-LABEL: func @vector_load_op_0d
+// CHECK: %[[load:.*]] = memref.load %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK: %[[vec:.*]] = llvm.mlir.undef : vector<1xf32>
+// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[inserted:.*]] = llvm.insertelement %[[load]], %[[vec]][%[[c0]] : i32] : vector<1xf32>
+// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[inserted]] : vector<1xf32> to vector<f32>
+// CHECK: return %[[cast]] : vector<f32>
+
+// -----
+
+func.func @vector_store_op_0d(%memref : memref<200x100xf32>, %i : index, %j : index) {
+ %val = arith.constant dense<11.0> : vector<f32>
+ vector.store %val, %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ return
+}
+
+// CHECK-LABEL: func @vector_store_op_0d
+// CHECK: %[[val:.*]] = arith.constant dense<1.100000e+01> : vector<f32>
+// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[val]] : vector<f32> to vector<1xf32>
+// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[extracted:.*]] = llvm.extractelement %[[cast]][%[[c0]] : i64] : vector<1xf32>
+// CHECK: memref.store %[[extracted]], %{{.*}}[%{{.*}}, %{{.*}}]
+
+// -----
+
func.func @masked_load_op(%arg0: memref<?xf32>, %arg1: vector<16xi1>, %arg2: vector<16xf32>) -> vector<16xf32> {
%c0 = arith.constant 0: index
%0 = vector.maskedload %arg0[%c0], %arg1, %arg2 : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
diff --git a/mlir/test/Dialect/Affine/loop-fusion.mlir b/mlir/test/Dialect/Affine/loop-fusion.mlir
index 8c536e631a86..045b1bec272e 100644
--- a/mlir/test/Dialect/Affine/loop-fusion.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion.mlir
@@ -1541,5 +1541,37 @@ func.func @should_fuse_and_preserve_dep_on_constant() {
return
}
+// -----
+
+// CHECK-LABEL: @producer_consumer_with_outmost_user
+func.func @producer_consumer_with_outmost_user(%arg0 : f16) {
+ %c0 = arith.constant 0 : index
+ %src = memref.alloc() : memref<f16, 1>
+ %dst = memref.alloc() : memref<f16>
+ %tag = memref.alloc() : memref<1xi32>
+ affine.for %arg1 = 4 to 6 {
+ affine.for %arg2 = 0 to 1 {
+ %0 = arith.addf %arg0, %arg0 : f16
+ affine.store %0, %src[] : memref<f16, 1>
+ }
+ affine.for %arg3 = 0 to 1 {
+ %0 = affine.load %src[] : memref<f16, 1>
+ }
+ }
+ affine.dma_start %src[], %dst[], %tag[%c0], %c0 : memref<f16, 1>, memref<f16>, memref<1xi32>
+ // CHECK: %[[CST_INDEX:.*]] = arith.constant 0 : index
+ // CHECK: %[[DMA_SRC:.*]] = memref.alloc() : memref<f16, 1>
+ // CHECK: %[[DMA_DST:.*]] = memref.alloc() : memref<f16>
+ // CHECK: %[[DMA_TAG:.*]] = memref.alloc() : memref<1xi32>
+ // CHECK: affine.for %arg1 = 4 to 6
+ // CHECK-NEXT: affine.for %arg2 = 0 to 1
+ // CHECK-NEXT: %[[RESULT_ADD:.*]] = arith.addf %arg0, %arg0 : f16
+ // CHECK-NEXT: affine.store %[[RESULT_ADD]], %[[DMA_SRC]][] : memref<f16, 1>
+ // CHECK-NEXT: affine.load %[[DMA_SRC]][] : memref<f16, 1>
+ // CHECK: affine.dma_start %[[DMA_SRC]][], %[[DMA_DST]][], %[[DMA_TAG]][%[[CST_INDEX]]], %[[CST_INDEX]] : memref<f16, 1>, memref<f16>, memref<1xi32>
+ // CHECK-NEXT: return
+ return
+}
+
// Add further tests in mlir/test/Transforms/loop-fusion-4.mlir
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
index 2c41f098c6c1..046e8ff64fba 100644
--- a/mlir/test/Dialect/Arith/expand-ops.mlir
+++ b/mlir/test/Dialect/Arith/expand-ops.mlir
@@ -176,8 +176,8 @@ func.func @ceildivui_index(%arg0: index, %arg1: index) -> (index) {
// -----
-// CHECK-LABEL: func @maxf
-func.func @maxf(%a: f32, %b: f32) -> f32 {
+// CHECK-LABEL: func @maximumf
+func.func @maximumf(%a: f32, %b: f32) -> f32 {
%result = arith.maximumf %a, %b : f32
return %result : f32
}
@@ -190,8 +190,8 @@ func.func @maxf(%a: f32, %b: f32) -> f32 {
// -----
-// CHECK-LABEL: func @maxf_vector
-func.func @maxf_vector(%a: vector<4xf16>, %b: vector<4xf16>) -> vector<4xf16> {
+// CHECK-LABEL: func @maximumf_vector
+func.func @maximumf_vector(%a: vector<4xf16>, %b: vector<4xf16>) -> vector<4xf16> {
%result = arith.maximumf %a, %b : vector<4xf16>
return %result : vector<4xf16>
}
@@ -204,8 +204,23 @@ func.func @maxf_vector(%a: vector<4xf16>, %b: vector<4xf16>) -> vector<4xf16> {
// -----
-// CHECK-LABEL: func @minf
-func.func @minf(%a: f32, %b: f32) -> f32 {
+// CHECK-LABEL: func @maxnumf
+func.func @maxnumf(%a: f32, %b: f32) -> f32 {
+ %result = arith.maxnumf %a, %b : f32
+ return %result : f32
+}
+
+// CHECK-SAME: %[[LHS:.*]]: f32, %[[RHS:.*]]: f32)
+// CHECK-NEXT: %[[CMP:.*]] = arith.cmpf ugt, %[[LHS]], %[[RHS]] : f32
+// CHECK-NEXT: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LHS]], %[[RHS]] : f32
+// CHECK-NEXT: %[[IS_NAN:.*]] = arith.cmpf uno, %[[LHS]], %[[LHS]] : f32
+// CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[IS_NAN]], %[[RHS]], %[[SELECT]] : f32
+// CHECK-NEXT: return %[[RESULT]] : f32
+
+// -----
+
+// CHECK-LABEL: func @minimumf
+func.func @minimumf(%a: f32, %b: f32) -> f32 {
%result = arith.minimumf %a, %b : f32
return %result : f32
}
@@ -219,6 +234,21 @@ func.func @minf(%a: f32, %b: f32) -> f32 {
// -----
+// CHECK-LABEL: func @minnumf
+func.func @minnumf(%a: f32, %b: f32) -> f32 {
+ %result = arith.minnumf %a, %b : f32
+ return %result : f32
+}
+
+// CHECK-SAME: %[[LHS:.*]]: f32, %[[RHS:.*]]: f32)
+// CHECK-NEXT: %[[CMP:.*]] = arith.cmpf ult, %[[LHS]], %[[RHS]] : f32
+// CHECK-NEXT: %[[SELECT:.*]] = arith.select %[[CMP]], %[[LHS]], %[[RHS]] : f32
+// CHECK-NEXT: %[[IS_NAN:.*]] = arith.cmpf uno, %[[LHS]], %[[LHS]] : f32
+// CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[IS_NAN]], %[[RHS]], %[[SELECT]] : f32
+// CHECK-NEXT: return %[[RESULT]] : f32
+
+// -----
+
func.func @truncf_f32(%arg0 : f32) -> bf16 {
%0 = arith.truncf %arg0 : f32 to bf16
return %0 : bf16
diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir
index 6ad646d7c62f..46eccb1c24ee 100644
--- a/mlir/test/Dialect/EmitC/invalid_ops.mlir
+++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir
@@ -1,7 +1,15 @@
// RUN: mlir-opt %s -split-input-file -verify-diagnostics
+func.func @const_attribute_str() {
+ // expected-error @+1 {{'emitc.constant' op string attributes are not supported, use #emitc.opaque instead}}
+ %c0 = "emitc.constant"(){value = "NULL"} : () -> !emitc.ptr<i32>
+ return
+}
+
+// -----
+
func.func @const_attribute_return_type_1() {
- // expected-error @+1 {{'emitc.constant' op requires attribute's type ('i64') to match op's return type ('i32')}}
+ // expected-error @+1 {{'emitc.constant' op requires attribute to either be an #emitc.opaque attribute or it's type ('i64') to match the op's result type ('i32')}}
%c0 = "emitc.constant"(){value = 42: i64} : () -> i32
return
}
@@ -9,8 +17,8 @@ func.func @const_attribute_return_type_1() {
// -----
func.func @const_attribute_return_type_2() {
- // expected-error @+1 {{'emitc.constant' op requires attribute's type ('!emitc.opaque<"char">') to match op's return type ('!emitc.opaque<"mychar">')}}
- %c0 = "emitc.constant"(){value = "CHAR_MIN" : !emitc.opaque<"char">} : () -> !emitc.opaque<"mychar">
+ // expected-error @+1 {{'emitc.constant' op attribute 'value' failed to satisfy constraint: An opaque attribute or TypedAttr instance}}
+ %c0 = "emitc.constant"(){value = unit} : () -> i32
return
}
@@ -18,7 +26,7 @@ func.func @const_attribute_return_type_2() {
func.func @empty_constant() {
// expected-error @+1 {{'emitc.constant' op value must not be empty}}
- %c0 = "emitc.constant"(){value = ""} : () -> i32
+ %c0 = "emitc.constant"(){value = #emitc.opaque<"">} : () -> i32
return
}
@@ -98,7 +106,7 @@ func.func @illegal_operand() {
// -----
func.func @var_attribute_return_type_1() {
- // expected-error @+1 {{'emitc.variable' op requires attribute's type ('i64') to match op's return type ('i32')}}
+ // expected-error @+1 {{'emitc.variable' op requires attribute to either be an #emitc.opaque attribute or it's type ('i64') to match the op's result type ('i32')}}
%c0 = "emitc.variable"(){value = 42: i64} : () -> i32
return
}
@@ -106,8 +114,8 @@ func.func @var_attribute_return_type_1() {
// -----
func.func @var_attribute_return_type_2() {
- // expected-error @+1 {{'emitc.variable' op requires attribute's type ('!emitc.ptr<i64>') to match op's return type ('!emitc.ptr<i32>')}}
- %c0 = "emitc.variable"(){value = "nullptr" : !emitc.ptr<i64>} : () -> !emitc.ptr<i32>
+ // expected-error @+1 {{'emitc.variable' op attribute 'value' failed to satisfy constraint: An opaque attribute or TypedAttr instance}}
+ %c0 = "emitc.variable"(){value = unit} : () -> i32
return
}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index d8a40f89f80a..8a34d6432607 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -333,9 +333,17 @@ func.func @reduce_invalid_op_type_maximumf(%arg0 : i32) {
// -----
-func.func @subgroup_reduce_bad_type(%arg0 : vector<2xf32>) {
- // expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float}}
- %res = gpu.subgroup_reduce add %arg0 : (vector<2xf32>) -> vector<2xf32>
+func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) {
+ // expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or vector of}}
+ %res = gpu.subgroup_reduce add %arg0 : (vector<2x2xf32>) -> vector<2x2xf32>
+ return
+}
+
+// -----
+
+func.func @subgroup_reduce_bad_type_scalable(%arg0 : vector<[2]xf32>) {
+ // expected-error@+1 {{is not compatible with scalable vector types}}
+ %res = gpu.subgroup_reduce add %arg0 : (vector<[2]xf32>) -> vector<[2]xf32>
return
}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 481934364156..605124243830 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -84,6 +84,8 @@ module attributes {gpu.container_module} {
%one = arith.constant 1.0 : f32
+ %vec = vector.broadcast %arg0 : f32 to vector<4xf32>
+
// CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} {
// CHECK-NEXT: } : (f32) -> f32
%sum = gpu.all_reduce add %one {} : (f32) -> (f32)
@@ -98,6 +100,9 @@ module attributes {gpu.container_module} {
// CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} uniform : (f32) -> f32
%sum_subgroup1 = gpu.subgroup_reduce add %one uniform : (f32) -> f32
+ // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (vector<4xf32>) -> vector<4xf32>
+ %sum_subgroup2 = gpu.subgroup_reduce add %vec : (vector<4xf32>) -> vector<4xf32>
+
%width = arith.constant 7 : i32
%offset = arith.constant 3 : i32
// CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32
diff --git a/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
new file mode 100644
index 000000000000..b7146071bf2f
--- /dev/null
+++ b/mlir/test/Dialect/GPU/subgroup-reduce-lowering.mlir
@@ -0,0 +1,71 @@
+// RUN: mlir-opt --allow-unregistered-dialect --test-gpu-subgroup-reduce-lowering %s | FileCheck %s
+
+// CHECK: gpu.module @kernels {
+gpu.module @kernels {
+
+ // CHECK-LABEL: gpu.func @kernel0(
+ // CHECK-SAME: %[[ARG0:.+]]: vector<5xf16>)
+ gpu.func @kernel0(%arg0: vector<5xf16>) kernel {
+ // CHECK: %[[VZ:.+]] = arith.constant dense<0.0{{.*}}> : vector<5xf16>
+ // CHECK: %[[E0:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [0], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
+ // CHECK: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (vector<2xf16>) -> vector<2xf16>
+ // CHECK: %[[V0:.+]] = vector.insert_strided_slice %[[R0]], %[[VZ]] {offsets = [0], strides = [1]} : vector<2xf16> into vector<5xf16>
+ // CHECK: %[[E1:.+]] = vector.extract_strided_slice %[[ARG0]] {offsets = [2], sizes = [2], strides = [1]} : vector<5xf16> to vector<2xf16>
+ // CHECK: %[[R1:.+]] = gpu.subgroup_reduce add %[[E1]] : (vector<2xf16>) -> vector<2xf16>
+ // CHECK: %[[V1:.+]] = vector.insert_strided_slice %[[R1]], %[[V0]] {offsets = [2], strides = [1]} : vector<2xf16> into vector<5xf16>
+ // CHECK: %[[E2:.+]] = vector.extract %[[ARG0]][4] : f16 from vector<5xf16>
+ // CHECK: %[[R2:.+]] = gpu.subgroup_reduce add %[[E2]] : (f16) -> f16
+ // CHECK: %[[V2:.+]] = vector.insert %[[R2]], %[[V1]] [4] : f16 into vector<5xf16>
+ // CHECK: "test.consume"(%[[V2]]) : (vector<5xf16>) -> ()
+ %sum0 = gpu.subgroup_reduce add %arg0 : (vector<5xf16>) -> (vector<5xf16>)
+ "test.consume"(%sum0) : (vector<5xf16>) -> ()
+
+
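+    // A vector<5xf16> reduction is expected to be decomposed the same way as
+    // above: two vector<2xf16> chunks plus one trailing f16 element, i.e.
+    // three gpu.subgroup_reduce ops in total.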
+ // CHECK-COUNT-3: gpu.subgroup_reduce mul {{.+}} uniform
+ // CHECK: "test.consume"
+ %sum1 = gpu.subgroup_reduce mul %arg0 uniform : (vector<5xf16>) -> (vector<5xf16>)
+ "test.consume"(%sum1) : (vector<5xf16>) -> ()
+
+ // CHECK: gpu.return
+ gpu.return
+ }
+
+ // CHECK-LABEL: gpu.func @kernel1(
+ // CHECK-SAME: %[[ARG0:.+]]: vector<1xf32>)
+ gpu.func @kernel1(%arg0: vector<1xf32>) kernel {
+ // CHECK: %[[E0:.+]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32>
+ // CHECK: %[[R0:.+]] = gpu.subgroup_reduce add %[[E0]] : (f32) -> f32
+ // CHECK: %[[V0:.+]] = vector.broadcast %[[R0]] : f32 to vector<1xf32>
+ // CHECK: "test.consume"(%[[V0]]) : (vector<1xf32>) -> ()
+ %sum0 = gpu.subgroup_reduce add %arg0 : (vector<1xf32>) -> (vector<1xf32>)
+ "test.consume"(%sum0) : (vector<1xf32>) -> ()
+
+ // CHECK: gpu.subgroup_reduce add {{.+}} uniform : (f32) -> f32
+ // CHECK: "test.consume"
+ %sum1 = gpu.subgroup_reduce add %arg0 uniform : (vector<1xf32>) -> (vector<1xf32>)
+ "test.consume"(%sum1) : (vector<1xf32>) -> ()
+
+ // CHECK: gpu.return
+ gpu.return
+ }
+
+ // These vectors fit the native shuffle size and should not be broken down.
+ //
+ // CHECK-LABEL: gpu.func @kernel2(
+ // CHECK-SAME: %[[ARG0:.+]]: vector<3xi8>, %[[ARG1:.+]]: vector<4xi8>)
+ gpu.func @kernel2(%arg0: vector<3xi8>, %arg1: vector<4xi8>) kernel {
+ // CHECK: %[[R0:.+]] = gpu.subgroup_reduce add %[[ARG0]] : (vector<3xi8>) -> vector<3xi8>
+ // CHECK: "test.consume"(%[[R0]]) : (vector<3xi8>) -> ()
+ %sum0 = gpu.subgroup_reduce add %arg0 : (vector<3xi8>) -> (vector<3xi8>)
+ "test.consume"(%sum0) : (vector<3xi8>) -> ()
+
+ // CHECK: %[[R1:.+]] = gpu.subgroup_reduce add %[[ARG1]] : (vector<4xi8>) -> vector<4xi8>
+ // CHECK: "test.consume"(%[[R1]]) : (vector<4xi8>) -> ()
+ %sum1 = gpu.subgroup_reduce add %arg1 : (vector<4xi8>) -> (vector<4xi8>)
+ "test.consume"(%sum1) : (vector<4xi8>) -> ()
+
+ // CHECK: gpu.return
+ gpu.return
+ }
+
+}
diff --git a/mlir/test/Dialect/LLVMIR/func.mlir b/mlir/test/Dialect/LLVMIR/func.mlir
index d09df0767612..9dc1bc57034e 100644
--- a/mlir/test/Dialect/LLVMIR/func.mlir
+++ b/mlir/test/Dialect/LLVMIR/func.mlir
@@ -191,14 +191,16 @@ module {
// CHECK: llvm.func @test_ccs
llvm.func @test_ccs() {
+ // CHECK-NEXT: %[[PTR:.*]] = llvm.mlir.addressof @cconv4 : !llvm.ptr
+ %ptr = llvm.mlir.addressof @cconv4 : !llvm.ptr
// CHECK-NEXT: llvm.call @cconv1() : () -> ()
// CHECK-NEXT: llvm.call @cconv2() : () -> ()
// CHECK-NEXT: llvm.call fastcc @cconv3() : () -> ()
- // CHECK-NEXT: llvm.call cc_10 @cconv4() : () -> ()
+ // CHECK-NEXT: llvm.call cc_10 %[[PTR]]() : !llvm.ptr, () -> ()
llvm.call @cconv1() : () -> ()
llvm.call ccc @cconv2() : () -> ()
llvm.call fastcc @cconv3() : () -> ()
- llvm.call cc_10 @cconv4() : () -> ()
+ llvm.call cc_10 %ptr() : !llvm.ptr, () -> ()
llvm.return
}
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index b684be1f9626..63e7a46f1bdb 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -324,6 +324,53 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 {
// -----
+llvm.func @static_alloca() -> f32 {
+ %0 = llvm.mlir.constant(4 : i32) : i32
+ %1 = llvm.alloca %0 x f32 : (i32) -> !llvm.ptr
+ %2 = llvm.load %1 : !llvm.ptr -> f32
+ llvm.return %2 : f32
+}
+
+// CHECK-LABEL: llvm.func @test_inline
+llvm.func @test_inline(%cond0 : i1) {
+ // Verify the alloca is relocated to the entry block of the parent function
+  // if the region operation is neither marked as isolated from above nor as
+  // an automatic allocation scope.
+ // CHECK: %[[ALLOCA:.+]] = llvm.alloca
+ // CHECK: "test.one_region_op"() ({
+ "test.one_region_op"() ({
+ %0 = llvm.call @static_alloca() : () -> f32
+ // CHECK-NEXT: llvm.intr.lifetime.start 4, %[[ALLOCA]]
+ // CHECK-NEXT: %[[RES:.+]] = llvm.load %[[ALLOCA]]
+ // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[ALLOCA]]
+ // CHECK-NEXT: test.region_yield %[[RES]]
+ test.region_yield %0 : f32
+ }) : () -> ()
+ // Verify the alloca is not relocated out of operations that are marked as
+ // isolated from above.
+ // CHECK-NOT: llvm.alloca
+ // CHECK: test.isolated_regions
+ test.isolated_regions {
+ // CHECK: %[[ALLOCA:.+]] = llvm.alloca
+ %0 = llvm.call @static_alloca() : () -> f32
+ // CHECK: test.region_yield
+ test.region_yield %0 : f32
+ }
+ // Verify the alloca is not relocated out of operations that are marked as
+ // automatic allocation scope.
+ // CHECK-NOT: llvm.alloca
+ // CHECK: test.alloca_scope_region
+ test.alloca_scope_region {
+ // CHECK: %[[ALLOCA:.+]] = llvm.alloca
+ %0 = llvm.call @static_alloca() : () -> f32
+ // CHECK: test.region_yield
+ test.region_yield %0 : f32
+ }
+ llvm.return
+}
+
+// -----
+
llvm.func @alloca_with_lifetime(%cond: i1) -> f32 {
%0 = llvm.mlir.constant(4 : i32) : i32
%1 = llvm.alloca %0 x f32 : (i32) -> !llvm.ptr
diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize-empty-tensor-elimination.mlir
index 0172760576ef..761b75d81837 100644
--- a/mlir/test/Dialect/Linalg/one-shot-bufferize-empty-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize-empty-tensor-elimination.mlir
@@ -42,3 +42,89 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// -----
+
+#map = affine_map<(d0) -> (d0)>
+
+// This test is intended to check that the produced IR does not contain any
+// type errors from sharing empty tensor operations with different types.
+// The verifiers are sufficient to lock down the intended behavior.
+
+// CHECK-LABEL: func.func @collapse_shape_prevents_reuse(
+func.func @collapse_shape_prevents_reuse(%fill_value: f32) -> tensor<56xf32>
+{
+ %init0 = tensor.empty() : tensor<56xf32>
+ %init1 = tensor.empty() : tensor<56x1xf32>
+
+ %filled_tensor = linalg.fill
+ ins(%fill_value : f32)
+ outs(%init1 : tensor<56x1xf32>) -> tensor<56x1xf32>
+
+ // The collapse shape alters the tensor rank, so the %init1 tensor.empty cannot be
+ // pushed into the output of the linalg.generic.
+ %reshaped_tensor = tensor.collapse_shape %filled_tensor [[0, 1]]
+ : tensor<56x1xf32> into tensor<56xf32>
+
+ %bias = linalg.generic {
+ indexing_maps = [#map, #map],
+ iterator_types = ["parallel"]
+ } ins(%reshaped_tensor : tensor<56xf32>)
+ outs(%init0 : tensor<56xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<56xf32>
+
+ return %bias : tensor<56xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.eliminate_empty_tensors %0 : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+
+// This test is intended to check that the produced IR does not contain any
+// type errors from sharing empty tensor operations with different types.
+// The verifiers are sufficient to lock down the intended behavior.
+
+// CHECK-LABEL: func.func @collapse_cast_prevents_reuse(
+func.func @collapse_cast_prevents_reuse(%fill_value: f32) -> tensor<56x?xf32>
+{
+ %c1 = arith.constant 1 : index
+ %init0 = tensor.empty(%c1) : tensor<56x?xf32>
+ %init1 = tensor.empty() : tensor<56x1xf32>
+
+ %filled_tensor = linalg.fill
+ ins(%fill_value : f32)
+ outs(%init1 : tensor<56x1xf32>) -> tensor<56x1xf32>
+
+ // The cast alters the number of dynamic dims, so the %init1 tensor.empty cannot be
+ // pushed into the output of the linalg.generic.
+ %cast = tensor.cast %filled_tensor : tensor<56x1xf32> to tensor<56x?xf32>
+
+ %bias = linalg.generic {
+ indexing_maps = [#map, #map],
+ iterator_types = ["parallel", "parallel"]
+ } ins(%cast : tensor<56x?xf32>)
+ outs(%init0 : tensor<56x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<56x?xf32>
+
+ return %bias : tensor<56x?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.structured.eliminate_empty_tensors %0 : !transform.any_op
+ transform.yield
+ }
+}
diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir
index b9ac68d0592c..c18d964b370f 100644
--- a/mlir/test/Dialect/OpenACC/invalid.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid.mlir
@@ -462,8 +462,8 @@ acc.loop gang() {
// -----
%i64value = arith.constant 1 : i64
-// expected-error@+1 {{num_gangs expects a maximum of 3 values}}
-acc.parallel num_gangs(%i64value, %i64value, %i64value, %i64value : i64, i64, i64, i64) {
+// expected-error@+1 {{num_gangs expects a maximum of 3 values per segment}}
+acc.parallel num_gangs({%i64value: i64, %i64value : i64, %i64value : i64, %i64value : i64}) {
}
// -----
diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 05b0450c7fb9..5a95811685f8 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -137,7 +137,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
%pd = acc.present varPtr(%d : memref<10xf32>) -> memref<10xf32>
acc.data dataOperands(%pa, %pb, %pc, %pd: memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) {
%private = acc.private varPtr(%c : memref<10xf32>) -> memref<10xf32>
- acc.parallel num_gangs(%numGangs: i64) num_workers(%numWorkers: i64) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) {
+ acc.parallel num_gangs({%numGangs: i64}) num_workers(%numWorkers: i64 [#acc.device_type<nvidia>]) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) {
acc.loop gang {
scf.for %x = %lb to %c10 step %st {
acc.loop worker {
@@ -180,7 +180,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
// CHECK-NEXT: [[NUMWORKERS:%.*]] = arith.constant 10 : i64
// CHECK: acc.data dataOperands(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) {
// CHECK-NEXT: %[[P_ARG2:.*]] = acc.private varPtr([[ARG2]] : memref<10xf32>) -> memref<10xf32>
-// CHECK-NEXT: acc.parallel num_gangs([[NUMGANG]] : i64) num_workers([[NUMWORKERS]] : i64) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) {
+// CHECK-NEXT: acc.parallel num_gangs({[[NUMGANG]] : i64}) num_workers([[NUMWORKERS]] : i64 [#acc.device_type<nvidia>]) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) {
// CHECK-NEXT: acc.loop gang {
// CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] {
// CHECK-NEXT: acc.loop worker {
@@ -439,25 +439,25 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x
}
acc.parallel async(%idxValue: index) {
}
- acc.parallel wait(%i64value: i64) {
+ acc.parallel wait({%i64value: i64}) {
}
- acc.parallel wait(%i32value: i32) {
+ acc.parallel wait({%i32value: i32}) {
}
- acc.parallel wait(%idxValue: index) {
+ acc.parallel wait({%idxValue: index}) {
}
- acc.parallel wait(%i64value, %i32value, %idxValue : i64, i32, index) {
+ acc.parallel wait({%i64value : i64, %i32value : i32, %idxValue : index}) {
}
- acc.parallel num_gangs(%i64value: i64) {
+ acc.parallel num_gangs({%i64value: i64}) {
}
- acc.parallel num_gangs(%i32value: i32) {
+ acc.parallel num_gangs({%i32value: i32}) {
}
- acc.parallel num_gangs(%idxValue: index) {
+ acc.parallel num_gangs({%idxValue: index}) {
}
- acc.parallel num_gangs(%i64value, %i64value, %idxValue : i64, i64, index) {
+ acc.parallel num_gangs({%i64value: i64, %i64value: i64, %idxValue: index}) {
}
- acc.parallel num_workers(%i64value: i64) {
+ acc.parallel num_workers(%i64value: i64 [#acc.device_type<nvidia>]) {
}
- acc.parallel num_workers(%i32value: i32) {
+ acc.parallel num_workers(%i32value: i32 [#acc.device_type<default>]) {
}
acc.parallel num_workers(%idxValue: index) {
}
@@ -492,25 +492,25 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x
// CHECK-NEXT: }
// CHECK: acc.parallel async([[IDXVALUE]] : index) {
// CHECK-NEXT: }
-// CHECK: acc.parallel wait([[I64VALUE]] : i64) {
+// CHECK: acc.parallel wait({[[I64VALUE]] : i64}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel wait([[I32VALUE]] : i32) {
+// CHECK: acc.parallel wait({[[I32VALUE]] : i32}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel wait([[IDXVALUE]] : index) {
+// CHECK: acc.parallel wait({[[IDXVALUE]] : index}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) {
+// CHECK: acc.parallel wait({[[I64VALUE]] : i64, [[I32VALUE]] : i32, [[IDXVALUE]] : index}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel num_gangs([[I64VALUE]] : i64) {
+// CHECK: acc.parallel num_gangs({[[I64VALUE]] : i64}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel num_gangs([[I32VALUE]] : i32) {
+// CHECK: acc.parallel num_gangs({[[I32VALUE]] : i32}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel num_gangs([[IDXVALUE]] : index) {
+// CHECK: acc.parallel num_gangs({[[IDXVALUE]] : index}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel num_gangs([[I64VALUE]], [[I64VALUE]], [[IDXVALUE]] : i64, i64, index) {
+// CHECK: acc.parallel num_gangs({[[I64VALUE]] : i64, [[I64VALUE]] : i64, [[IDXVALUE]] : index}) {
// CHECK-NEXT: }
-// CHECK: acc.parallel num_workers([[I64VALUE]] : i64) {
+// CHECK: acc.parallel num_workers([[I64VALUE]] : i64 [#acc.device_type<nvidia>]) {
// CHECK-NEXT: }
-// CHECK: acc.parallel num_workers([[I32VALUE]] : i32) {
+// CHECK: acc.parallel num_workers([[I32VALUE]] : i32 [#acc.device_type<default>]) {
// CHECK-NEXT: }
// CHECK: acc.parallel num_workers([[IDXVALUE]] : index) {
// CHECK-NEXT: }
@@ -590,13 +590,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10
}
acc.serial async(%idxValue: index) {
}
- acc.serial wait(%i64value: i64) {
+ acc.serial wait({%i64value: i64}) {
}
- acc.serial wait(%i32value: i32) {
+ acc.serial wait({%i32value: i32}) {
}
- acc.serial wait(%idxValue: index) {
+ acc.serial wait({%idxValue: index}) {
}
- acc.serial wait(%i64value, %i32value, %idxValue : i64, i32, index) {
+ acc.serial wait({%i64value : i64, %i32value : i32, %idxValue : index}) {
}
%firstprivate = acc.firstprivate varPtr(%b : memref<10xf32>) -> memref<10xf32>
acc.serial private(@privatization_memref_10_f32 -> %a : memref<10xf32>, @privatization_memref_10_10_f32 -> %c : memref<10x10xf32>) firstprivate(@firstprivatization_memref_10xf32 -> %firstprivate : memref<10xf32>) {
@@ -627,13 +627,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10
// CHECK-NEXT: }
// CHECK: acc.serial async([[IDXVALUE]] : index) {
// CHECK-NEXT: }
-// CHECK: acc.serial wait([[I64VALUE]] : i64) {
+// CHECK: acc.serial wait({[[I64VALUE]] : i64}) {
// CHECK-NEXT: }
-// CHECK: acc.serial wait([[I32VALUE]] : i32) {
+// CHECK: acc.serial wait({[[I32VALUE]] : i32}) {
// CHECK-NEXT: }
-// CHECK: acc.serial wait([[IDXVALUE]] : index) {
+// CHECK: acc.serial wait({[[IDXVALUE]] : index}) {
// CHECK-NEXT: }
-// CHECK: acc.serial wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) {
+// CHECK: acc.serial wait({[[I64VALUE]] : i64, [[I32VALUE]] : i32, [[IDXVALUE]] : index}) {
// CHECK-NEXT: }
// CHECK: %[[FIRSTP:.*]] = acc.firstprivate varPtr([[ARGB]] : memref<10xf32>) -> memref<10xf32>
// CHECK: acc.serial firstprivate(@firstprivatization_memref_10xf32 -> %[[FIRSTP]] : memref<10xf32>) private(@privatization_memref_10_f32 -> [[ARGA]] : memref<10xf32>, @privatization_memref_10_10_f32 -> [[ARGC]] : memref<10x10xf32>) {
@@ -665,13 +665,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10
}
acc.kernels async(%idxValue: index) {
}
- acc.kernels wait(%i64value: i64) {
+ acc.kernels wait({%i64value: i64}) {
}
- acc.kernels wait(%i32value: i32) {
+ acc.kernels wait({%i32value: i32}) {
}
- acc.kernels wait(%idxValue: index) {
+ acc.kernels wait({%idxValue: index}) {
}
- acc.kernels wait(%i64value, %i32value, %idxValue : i64, i32, index) {
+ acc.kernels wait({%i64value : i64, %i32value : i32, %idxValue : index}) {
}
acc.kernels {
} attributes {defaultAttr = #acc<defaultvalue none>}
@@ -699,13 +699,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10
// CHECK-NEXT: }
// CHECK: acc.kernels async([[IDXVALUE]] : index) {
// CHECK-NEXT: }
-// CHECK: acc.kernels wait([[I64VALUE]] : i64) {
+// CHECK: acc.kernels wait({[[I64VALUE]] : i64}) {
// CHECK-NEXT: }
-// CHECK: acc.kernels wait([[I32VALUE]] : i32) {
+// CHECK: acc.kernels wait({[[I32VALUE]] : i32}) {
// CHECK-NEXT: }
-// CHECK: acc.kernels wait([[IDXVALUE]] : index) {
+// CHECK: acc.kernels wait({[[IDXVALUE]] : index}) {
// CHECK-NEXT: }
-// CHECK: acc.kernels wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) {
+// CHECK: acc.kernels wait({[[I64VALUE]] : i64, [[I32VALUE]] : i32, [[IDXVALUE]] : index}) {
// CHECK-NEXT: }
// CHECK: acc.kernels {
// CHECK-NEXT: } attributes {defaultAttr = #acc<defaultvalue none>}
diff --git a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
index 871ecd4f28b1..1cb69891a70e 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
@@ -1007,6 +1007,102 @@ func.func @umod_fail_fold(%arg0: i32) -> (i32, i32) {
// -----
//===----------------------------------------------------------------------===//
+// spirv.SNegate
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: @snegate_twice
+// CHECK-SAME: (%[[ARG:.*]]: i32)
+func.func @snegate_twice(%arg0 : i32) -> i32 {
+ %0 = spirv.SNegate %arg0 : i32
+ %1 = spirv.SNegate %0 : i32
+
+ // CHECK: return %[[ARG]] : i32
+ return %1 : i32
+}
+
+// CHECK-LABEL: @snegate_min
+func.func @snegate_min() -> (i8, i8) {
+ // CHECK: %[[MIN:.*]] = spirv.Constant -128 : i8
+ %cmin = spirv.Constant -128 : i8
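+  // Negating the i8 minimum wraps back to itself (there is no +128 in i8), so
+  // both the single and the double negation are expected to fold to -128.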
+
+ %0 = spirv.SNegate %cmin : i8
+ %1 = spirv.SNegate %0 : i8
+
+ // CHECK: return %[[MIN]], %[[MIN]]
+ return %0, %1 : i8, i8
+}
+
+// CHECK-LABEL: @const_fold_scalar_snegate
+func.func @const_fold_scalar_snegate() -> (i32, i32, i32) {
+ %c0 = spirv.Constant 0 : i32
+ %c3 = spirv.Constant 3 : i32
+ %cn3 = spirv.Constant -3 : i32
+
+ // CHECK-DAG: %[[THREE:.*]] = spirv.Constant 3 : i32
+ // CHECK-DAG: %[[NTHREE:.*]] = spirv.Constant -3 : i32
+ // CHECK-DAG: %[[ZERO:.*]] = spirv.Constant 0 : i32
+ %0 = spirv.SNegate %c0 : i32
+ %1 = spirv.SNegate %c3 : i32
+ %2 = spirv.SNegate %cn3 : i32
+
+ // CHECK: return %[[ZERO]], %[[NTHREE]], %[[THREE]]
+ return %0, %1, %2 : i32, i32, i32
+}
+
+// CHECK-LABEL: @const_fold_vector_snegate
+func.func @const_fold_vector_snegate() -> vector<3xi32> {
+ // CHECK: spirv.Constant dense<[0, 3, -3]>
+ %cv = spirv.Constant dense<[0, -3, 3]> : vector<3xi32>
+ %0 = spirv.SNegate %cv : vector<3xi32>
+ return %0 : vector<3xi32>
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.Not
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: @not_twice
+// CHECK-SAME: (%[[ARG:.*]]: i32)
+func.func @not_twice(%arg0 : i32) -> i32 {
+ %0 = spirv.Not %arg0 : i32
+ %1 = spirv.Not %0 : i32
+
+ // CHECK: return %[[ARG]] : i32
+ return %1 : i32
+}
+
+// CHECK-LABEL: @const_fold_scalar_not
+func.func @const_fold_scalar_not() -> (i32, i32, i32) {
+ %c0 = spirv.Constant 0 : i32
+ %c3 = spirv.Constant 3 : i32
+ %cn3 = spirv.Constant -3 : i32
+
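+  // Bitwise NOT folds as ~x == -x - 1, so ~0 == -1, ~3 == -4, and ~(-3) == 2.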
+ // CHECK-DAG: %[[TWO:.*]] = spirv.Constant 2 : i32
+ // CHECK-DAG: %[[NFOUR:.*]] = spirv.Constant -4 : i32
+ // CHECK-DAG: %[[NONE:.*]] = spirv.Constant -1 : i32
+ %0 = spirv.Not %c0 : i32
+ %1 = spirv.Not %c3 : i32
+ %2 = spirv.Not %cn3 : i32
+
+ // CHECK: return %[[NONE]], %[[NFOUR]], %[[TWO]]
+ return %0, %1, %2 : i32, i32, i32
+}
+
+// CHECK-LABEL: @const_fold_vector_not
+func.func @const_fold_vector_not() -> vector<3xi32> {
+ %cv = spirv.Constant dense<[-1, -4, 2]> : vector<3xi32>
+
+ // CHECK: spirv.Constant dense<[0, 3, -3]>
+ %0 = spirv.Not %cv : vector<3xi32>
+
+ return %0 : vector<3xi32>
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
// spirv.LogicalAnd
//===----------------------------------------------------------------------===//
@@ -1040,6 +1136,38 @@ func.func @convert_logical_and_true_false_vector(%arg: vector<3xi1>) -> (vector<
// spirv.LogicalNot
//===----------------------------------------------------------------------===//
+// CHECK-LABEL: @logical_not_twice
+// CHECK-SAME: (%[[ARG:.*]]: i1)
+func.func @logical_not_twice(%arg0 : i1) -> i1 {
+ %0 = spirv.LogicalNot %arg0 : i1
+ %1 = spirv.LogicalNot %0 : i1
+
+ // CHECK: return %[[ARG]] : i1
+ return %1 : i1
+}
+
+// CHECK-LABEL: @const_fold_scalar_logical_not
+func.func @const_fold_scalar_logical_not() -> i1 {
+ %true = spirv.Constant true
+
+ // CHECK: spirv.Constant false
+ %0 = spirv.LogicalNot %true : i1
+
+ return %0 : i1
+}
+
+// CHECK-LABEL: @const_fold_vector_logical_not
+func.func @const_fold_vector_logical_not() -> vector<2xi1> {
+ %cv = spirv.Constant dense<[true, false]> : vector<2xi1>
+
+ // CHECK: spirv.Constant dense<[false, true]>
+ %0 = spirv.LogicalNot %cv : vector<2xi1>
+
+ return %0 : vector<2xi1>
+}
+
+// -----
+
func.func @convert_logical_not_to_not_equal(%arg0: vector<3xi64>, %arg1: vector<3xi64>) -> vector<3xi1> {
// CHECK: %[[RESULT:.*]] = spirv.INotEqual {{%.*}}, {{%.*}} : vector<3xi64>
// CHECK-NEXT: spirv.ReturnValue %[[RESULT]] : vector<3xi1>
diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
new file mode 100644
index 000000000000..bdfe18acd86c
--- /dev/null
+++ b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
@@ -0,0 +1,58 @@
+// RUN: mlir-opt -split-input-file -test-tensor-transform-patterns="test-simplify-pack-unpack-patterns" %s | FileCheck %s
+
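+// The simplification pattern exercised below is expected to rewrite a
+// tensor.pack that only tiles the innermost dimension, without padding or an
+// outer-dims permutation, into a plain tensor.expand_shape; the negative
+// tests check that other packs are left untouched.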
+// CHECK-LABEL: func.func @single_dim_packing(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<256xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] : tensor<256xf32> into tensor<8x32xf32>
+// CHECK: return %[[EXPANDED]] : tensor<8x32xf32>
+func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> {
+ %empty = tensor.empty() : tensor<8x32xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256xf32> -> tensor<8x32xf32>
+ return %0 : tensor<8x32xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @single_dim_packing_with_padding(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<255xf32>)
+// CHECK-NOT: tensor.expand_shape
+// CHECK: tensor.pack
+func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x32xf32> {
+ %empty = tensor.empty() : tensor<8x32xf32>
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = tensor.pack %arg0 padding_value(%cst : f32) inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<255xf32> -> tensor<8x32xf32>
+ return %0 : tensor<8x32xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @single_last_inner_dim_packing(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<5x256xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<5x256xf32> into tensor<5x8x32xf32>
+// CHECK: return %[[EXPANDED]] : tensor<5x8x32xf32>
+func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> {
+ %empty = tensor.empty() : tensor<5x8x32xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32>
+ return %0 : tensor<5x8x32xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @packing_with_outer_dims_perm(
+// CHECK-NOT: tensor.expand_shape
+// CHECK: tensor.pack
+func.func @packing_with_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<8x5x32xf32> {
+ %empty = tensor.empty() : tensor<8x5x32xf32>
+ %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<8x5x32xf32>
+ return %0 : tensor<8x5x32xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @single_first_inner_dim_packing(
+// CHECK-NOT: tensor.expand_shape
+// CHECK: tensor.pack
+func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x5x32xf32> {
+ %empty = tensor.empty() : tensor<8x5x32xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256x5xf32> -> tensor<8x5x32xf32>
+ return %0 : tensor<8x5x32xf32>
+}
diff --git a/mlir/test/Dialect/Tensor/simplify-tensor-pack.mlir b/mlir/test/Dialect/Tensor/simplify-tensor-pack.mlir
deleted file mode 100644
index 75eb33ed033b..000000000000
--- a/mlir/test/Dialect/Tensor/simplify-tensor-pack.mlir
+++ /dev/null
@@ -1,24 +0,0 @@
-// RUN: mlir-opt -split-input-file -test-tensor-transform-patterns="test-simplify-pack-patterns" %s | FileCheck %s
-
-// CHECK: func.func @single_dim_packing(
-// CHECK-SAME: %[[ARG0:.+]]: tensor<256xf32>)
-// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1]] : tensor<256xf32> into tensor<8x32xf32>
-// CHECK: return %[[EXPANDED]] : tensor<8x32xf32>
-func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> {
- %empty = tensor.empty() : tensor<8x32xf32>
- %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256xf32> -> tensor<8x32xf32>
- return %0 : tensor<8x32xf32>
-}
-
-// -----
-
-// CHECK: func.func @single_dim_packing_with_padding(
-// CHECK-SAME: %[[ARG0:.+]]: tensor<255xf32>)
-// CHECK-NOT: tensor.expand_shape
-// CHECK: tensor.pack
-func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x32xf32> {
- %empty = tensor.empty() : tensor<8x32xf32>
- %cst = arith.constant 0.000000e+00 : f32
- %0 = tensor.pack %arg0 padding_value(%cst : f32) inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<255xf32> -> tensor<8x32xf32>
- return %0 : tensor<8x32xf32>
-}
diff --git a/mlir/test/Dialect/Transform/test-pattern-application.mlir b/mlir/test/Dialect/Transform/test-pattern-application.mlir
index 2fd47c6bae39..ff9a535c8384 100644
--- a/mlir/test/Dialect/Transform/test-pattern-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pattern-application.mlir
@@ -179,7 +179,6 @@ module {
// CHECK: return %[[c5]]
func.func @canonicalization(%t: tensor<5xf32>) -> index {
%c0 = arith.constant 0 : index
- // expected-remark @below {{op was replaced}}
%dim = tensor.dim %t, %c0 : tensor<5xf32>
return %dim : index
}
@@ -191,7 +190,6 @@ transform.sequence failures(propagate) {
transform.apply_patterns to %1 {
transform.apply_patterns.canonicalization
} : !transform.any_op
- transform.test_print_remark_at_operand %0, "op was replaced" : !transform.any_op
}
// -----
diff --git a/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir b/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir
index 79429afd8ff2..31ae126906f2 100644
--- a/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir
+++ b/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir
@@ -10,9 +10,41 @@
// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]] : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
// CHECK-NEXT: return %[[R]] : vector<64x64xf32>
-func.func @fold_arith_extf_into_contract(%arg0: vector<64x64xf16>, %arg1: vector<64x64xf16>, %arg2: vector<64x64xf32>) -> vector<64x64xf32> {
+func.func @fold_arith_extf_into_contract(
+ %arg0: vector<64x64xf16>,
+ %arg1: vector<64x64xf16>,
+ %arg2: vector<64x64xf32>) -> vector<64x64xf32> {
%lhs_f32 = arith.extf %arg0 : vector<64x64xf16> to vector<64x64xf32>
%rhs_f32 = arith.extf %arg1 : vector<64x64xf16> to vector<64x64xf32>
- %result = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_f32, %rhs_f32, %arg2 : vector<64x64xf32>, vector<64x64xf32> into vector<64x64xf32>
+ %result = vector.contract {
+ indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel", "reduction"],
+ kind = #vector.kind<add>}
+ %lhs_f32, %rhs_f32, %arg2 : vector<64x64xf32>, vector<64x64xf32> into vector<64x64xf32>
return %result : vector<64x64xf32>
-}
\ No newline at end of file
+}
+
+// -----
+
+// CHECK-DAG: #[[$map0:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK-DAG: #[[$map2:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CHECK-LABEL: func.func @fold_arith_extf_into_contract_scalable
+// CHECK-SAME: (%[[ARG0:.*]]: vector<[64]x64xf16>, %[[ARG1:.*]]: vector<64x64xf16>, %[[ARG2:.*]]: vector<[64]x64xf32>)
+// CHECK-NEXT: %[[R:.+]] = vector.contract {indexing_maps = [#[[$map0]], #[[$map1]], #[[$map2]]],
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
+// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]] : vector<[64]x64xf16>, vector<64x64xf16> into vector<[64]x64xf32>
+// CHECK-NEXT: return %[[R]] : vector<[64]x64xf32>
+func.func @fold_arith_extf_into_contract_scalable(
+ %arg0: vector<[64]x64xf16>,
+ %arg1: vector<64x64xf16>,
+ %arg2: vector<[64]x64xf32>) -> vector<[64]x64xf32> {
+ %lhs_f32 = arith.extf %arg0 : vector<[64]x64xf16> to vector<[64]x64xf32>
+ %rhs_f32 = arith.extf %arg1 : vector<64x64xf16> to vector<64x64xf32>
+ %result = vector.contract {
+ indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel", "reduction"],
+ kind = #vector.kind<add>}
+ %lhs_f32, %rhs_f32, %arg2 : vector<[64]x64xf32>, vector<64x64xf32> into vector<[64]x64xf32>
+ return %result : vector<[64]x64xf32>
+}
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 9f1ec21cdabf..03532c5c1ceb 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -725,6 +725,16 @@ func.func @flat_transpose_int(%arg0: vector<16xi32>) -> vector<16xi32> {
return %0 : vector<16xi32>
}
+// CHECK-LABEL: @vector_load_and_store_0d_scalar_memref
+func.func @vector_load_and_store_0d_scalar_memref(%memref : memref<200x100xf32>,
+ %i : index, %j : index) {
+ // CHECK: %[[ld:.*]] = vector.load %{{.*}}[%{{.*}}] : memref<200x100xf32>, vector<f32>
+ %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ // CHECK: vector.store %[[ld]], %{{.*}}[%{{.*}}] : memref<200x100xf32>, vector<f32>
+ vector.store %0, %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ return
+}
+
// CHECK-LABEL: @vector_load_and_store_1d_scalar_memref
func.func @vector_load_and_store_1d_scalar_memref(%memref : memref<200x100xf32>,
%i : index, %j : index) {
diff --git a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matmul-transforms.mlir
index 7588b738ff9a..7a60ff8ea858 100644
--- a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matmul-transforms.mlir
@@ -1,20 +1,22 @@
// RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s
-// NOTE - tests in this file are duplicated so that there's a version for
-// * _fixed width_ and for _scalable_ vectors.
-// In order for the "vector.contract -> vector.outerproduct" patterns to work,
-// only the non-reduction dimension can be scalable (*). For Matmul operations
-// that is set to be the N dimension (i.e. rows of the output matrix), which
-// matches how matrix multiplication are normally implemented for e.g.
-// Arm SVE. However, making the M dimension scalable (i.e. columns of the
-// output matrix) should work as well.
-//
-// (*) The conversion tested in this file unrolls along the reduction
-// dimension, which is not supported for scalable vectors.
+/// Tests for `vector.contract` -> `vector.outerproduct` transformations for
+/// matmul operations:
+/// C += A * B.
+/// (A, B and C are 2-D matrices). At the moment, three different variants are
+/// tested:
+/// * plain (no mask, fixed-width vectors),
+/// * masked (mask + fixed-width vectors),
+/// * scalable (mask + scalable vectors).
+/// In order for the "vector.contract -> vector.outerproduct" patterns to work,
+/// only the non-reduction dimension can be scalable (*). For matmul operations
+/// that is set to be the N dimension (i.e. the number of columns of the
+/// output matrix), which matches how matrix multiplications are normally
+/// implemented for e.g. Arm SVE. However, making the M dimension scalable
+/// (i.e. the number of rows of the output matrix) should work as well.
+///
+/// (*) The conversion tested in this file unrolls along the reduction
+/// dimension, which is not supported for scalable vectors.
-// ============================================================================
-// Matmul 0 (plain + masked + mixed types)
-// ============================================================================
#matmat_accesses_0 = [
affine_map<(m, n, k) -> (m, k)>,
affine_map<(m, n, k) -> (k, n)>,
@@ -25,6 +27,49 @@
iterator_types = ["parallel", "parallel", "reduction"]
}
+#matmat_accesses_1 = [
+ affine_map<(m, n, k) -> (m, k)>,
+ affine_map<(m, n, k) -> (n, k)>,
+ affine_map<(m, n, k) -> (m, n)>
+]
+#matmat_trait_1 = {
+ indexing_maps = #matmat_accesses_1,
+ iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+#matmat_accesses_2 = [
+ affine_map<(m, n, k) -> (k, m)>,
+ affine_map<(m, n, k) -> (k, n)>,
+ affine_map<(m, n, k) -> (m, n)>
+]
+#matmat_trait_2 = {
+ indexing_maps = #matmat_accesses_2,
+ iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+#matmat_accesses_3 = [
+ affine_map<(m, n, k) -> (k, m)>,
+ affine_map<(m, n, k) -> (n, k)>,
+ affine_map<(m, n, k) -> (m, n)>
+]
+#matmat_trait_3 = {
+ indexing_maps = #matmat_accesses_3,
+ iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+#matmat_accesses_4 = [
+ affine_map<(m, n, k) -> (m, k)>,
+ affine_map<(m, n, k) -> (k, n)>,
+ affine_map<(m, n, k) -> (n, m)>
+]
+#matmat_trait_4 = {
+ indexing_maps = #matmat_accesses_4,
+ iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// ============================================================================
+// Matmul 0 (plain + masked + mixed types)
+// ============================================================================
// CHECK-LABEL: func @matmul
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x4xf32>,
// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<4x3xf32>,
@@ -53,10 +98,10 @@
// CHECK-SAME: : vector<2xf32>, vector<3xf32>
//
// CHECK: return %[[c3]] : vector<2x3xf32>
-func.func @matmul(%arg0: vector<2x4xf32>,
- %arg1: vector<4x3xf32>,
- %arg2: vector<2x3xf32>) -> vector<2x3xf32> {
- %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2
+func.func @matmul(%A: vector<2x4xf32>,
+ %B: vector<4x3xf32>,
+ %C: vector<2x3xf32>) -> vector<2x3xf32> {
+ %0 = vector.contract #matmat_trait_0 %A, %B, %C
: vector<2x4xf32>, vector<4x3xf32> into vector<2x3xf32>
return %0 : vector<2x3xf32>
}
@@ -89,10 +134,10 @@ func.func @matmul(%arg0: vector<2x4xf32>,
// CHECK-SAME: : vector<2xf32>, vector<[3]xf32>
//
// CHECK: return %[[c3]] : vector<2x[3]xf32>
-func.func @matmul_scalable(%arg0: vector<2x4xf32>,
- %arg1: vector<4x[3]xf32>,
- %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32> {
- %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2
+func.func @matmul_scalable(%A: vector<2x4xf32>,
+ %B: vector<4x[3]xf32>,
+ %C: vector<2x[3]xf32>) -> vector<2x[3]xf32> {
+ %0 = vector.contract #matmat_trait_0 %A, %B, %C
: vector<2x4xf32>, vector<4x[3]xf32> into vector<2x[3]xf32>
return %0 : vector<2x[3]xf32>
}
@@ -114,11 +159,11 @@ func.func @matmul_scalable(%arg0: vector<2x4xf32>,
// CHECK: %[[T_MASK_R4:.*]] = vector.extract %[[T_MASK]][4] : vector<3x7xi1> from vector<5x3x7xi1>
// CHECK: %{{.*}} = vector.mask %[[T_MASK_R4]] { vector.outerproduct %{{.*}} {kind = #vector.kind<add>} : vector<3xf32>, vector<7xf32> } : vector<3x7xi1> -> vector<3x7xf32>
-func.func @masked_matmul(%arg0: vector<3x5xf32>,
- %arg1: vector<5x7xf32>,
- %arg2: vector<3x7xf32>,
+func.func @masked_matmul(%A: vector<3x5xf32>,
+ %B: vector<5x7xf32>,
+ %C: vector<3x7xf32>,
%m : vector<3x7x5xi1>) -> vector<3x7xf32> {
- %0 = vector.mask %m { vector.contract #matmat_trait_0 %arg0, %arg1, %arg2
+ %0 = vector.mask %m { vector.contract #matmat_trait_0 %A, %B, %C
: vector<3x5xf32>, vector<5x7xf32> into vector<3x7xf32> } : vector<3x7x5xi1> -> vector<3x7xf32>
return %0 : vector<3x7xf32>
}
@@ -140,11 +185,11 @@ func.func @masked_matmul(%arg0: vector<3x5xf32>,
// CHECK: %[[T_MASK_R4:.*]] = vector.extract %[[T_MASK]][4] : vector<3x[7]xi1> from vector<5x3x[7]xi1>
// CHECK: %{{.*}} = vector.mask %[[T_MASK_R4]] { vector.outerproduct %{{.*}} {kind = #vector.kind<add>} : vector<3xf32>, vector<[7]xf32> } : vector<3x[7]xi1> -> vector<3x[7]xf32>
-func.func @masked_matmul_scalable(%arg0: vector<3x5xf32>,
- %arg1: vector<5x[7]xf32>,
- %arg2: vector<3x[7]xf32>,
+func.func @masked_matmul_scalable(%A: vector<3x5xf32>,
+ %B: vector<5x[7]xf32>,
+ %C: vector<3x[7]xf32>,
%m : vector<3x[7]x5xi1>) -> vector<3x[7]xf32> {
- %0 = vector.mask %m { vector.contract #matmat_trait_0 %arg0, %arg1, %arg2
+ %0 = vector.mask %m { vector.contract #matmat_trait_0 %A, %B, %C
: vector<3x5xf32>, vector<5x[7]xf32> into vector<3x[7]xf32> } : vector<3x[7]x5xi1> -> vector<3x[7]xf32>
return %0 : vector<3x[7]xf32>
}
@@ -160,11 +205,11 @@ func.func @masked_matmul_scalable(%arg0: vector<3x5xf32>,
// CHECK: %[[b1:.*]] = arith.extf %[[b0]] : vector<3xf16> to vector<3xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a1]], %[[b1]], %[[C]]
// CHECK: return %[[c0]] : vector<2x3xf32>
-func.func @matmul_mixed(%arg0: vector<2x1xf16>,
- %arg1: vector<1x3xf16>,
- %arg2: vector<2x3xf32>) -> vector<2x3xf32>
+func.func @matmul_mixed(%A: vector<2x1xf16>,
+ %B: vector<1x3xf16>,
+ %C: vector<2x3xf32>) -> vector<2x3xf32>
{
- %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_0 %A, %B, %C
: vector<2x1xf16>, vector<1x3xf16> into vector<2x3xf32>
return %0 : vector<2x3xf32>
}
@@ -180,28 +225,18 @@ func.func @matmul_mixed(%arg0: vector<2x1xf16>,
// CHECK: %[[b1:.*]] = arith.extf %[[b0]] : vector<[3]xf16> to vector<[3]xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a1]], %[[b1]], %[[C]]
// CHECK: return %[[c0]] : vector<2x[3]xf32>
-func.func @matmul_mixed_scalable(%arg0: vector<2x1xf16>,
- %arg1: vector<1x[3]xf16>,
- %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32>
+func.func @matmul_mixed_scalable(%A: vector<2x1xf16>,
+ %B: vector<1x[3]xf16>,
+ %C: vector<2x[3]xf32>) -> vector<2x[3]xf32>
{
- %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_0 %A, %B, %C
: vector<2x1xf16>, vector<1x[3]xf16> into vector<2x[3]xf32>
return %0 : vector<2x[3]xf32>
}
// ============================================================================
-// Matmul 1 (plain)
+// Matmul 1 (plain + scalable)
// ============================================================================
-#matmat_accesses_1 = [
- affine_map<(m, n, k) -> (m, k)>,
- affine_map<(m, n, k) -> (n, k)>,
- affine_map<(m, n, k) -> (m, n)>
-]
-#matmat_trait_1 = {
- indexing_maps = #matmat_accesses_1,
- iterator_types = ["parallel", "parallel", "reduction"]
-}
-
// CHECK-LABEL: func @matmul_1
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<3x1xf32>,
@@ -212,11 +247,11 @@ func.func @matmul_mixed_scalable(%arg0: vector<2x1xf16>,
// CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<3xf32> from vector<1x3xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
// CHECK: return %[[c0]] : vector<2x3xf32>
-func.func @matmul_1(%arg0: vector<2x1xf32>,
- %arg1: vector<3x1xf32>,
- %arg2: vector<2x3xf32>) -> vector<2x3xf32>
+func.func @matmul_1(%A: vector<2x1xf32>,
+ %B: vector<3x1xf32>,
+ %C: vector<2x3xf32>) -> vector<2x3xf32>
{
- %0 = vector.contract #matmat_trait_1 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_1 %A, %B, %C
: vector<2x1xf32>, vector<3x1xf32> into vector<2x3xf32>
return %0 : vector<2x3xf32>
}
@@ -231,28 +266,18 @@ func.func @matmul_1(%arg0: vector<2x1xf32>,
// CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<[3]xf32> from vector<1x[3]xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
// CHECK: return %[[c0]] : vector<2x[3]xf32>
-func.func @matmul_1_scalable(%arg0: vector<2x1xf32>,
- %arg1: vector<[3]x1xf32>,
- %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32>
+func.func @matmul_1_scalable(%A: vector<2x1xf32>,
+ %B: vector<[3]x1xf32>,
+ %C: vector<2x[3]xf32>) -> vector<2x[3]xf32>
{
- %0 = vector.contract #matmat_trait_1 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_1 %A, %B, %C
: vector<2x1xf32>, vector<[3]x1xf32> into vector<2x[3]xf32>
return %0 : vector<2x[3]xf32>
}
// ============================================================================
-// Matmul 2 (plain)
+// Matmul 2 (plain + scalable)
// ============================================================================
-#matmat_accesses_2 = [
- affine_map<(m, n, k) -> (k, m)>,
- affine_map<(m, n, k) -> (k, n)>,
- affine_map<(m, n, k) -> (m, n)>
-]
-#matmat_trait_2 = {
- indexing_maps = #matmat_accesses_2,
- iterator_types = ["parallel", "parallel", "reduction"]
-}
-
// CHECK-LABEL: func @matmul_2
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<1x2xf32>,
// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
@@ -261,11 +286,11 @@ func.func @matmul_1_scalable(%arg0: vector<2x1xf32>,
// CHECK: %[[b0:.*]] = vector.extract %[[B]][0] : vector<3xf32> from vector<1x3xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
// CHECK: return %[[c0]] : vector<2x3xf32>
-func.func @matmul_2(%arg0: vector<1x2xf32>,
- %arg1: vector<1x3xf32>,
- %arg2: vector<2x3xf32>) -> vector<2x3xf32>
+func.func @matmul_2(%A: vector<1x2xf32>,
+ %B: vector<1x3xf32>,
+ %C: vector<2x3xf32>) -> vector<2x3xf32>
{
- %0 = vector.contract #matmat_trait_2 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_2 %A, %B, %C
: vector<1x2xf32>, vector<1x3xf32> into vector<2x3xf32>
return %0 : vector<2x3xf32>
}
@@ -278,28 +303,18 @@ func.func @matmul_2(%arg0: vector<1x2xf32>,
// CHECK: %[[b0:.*]] = vector.extract %[[B]][0] : vector<[3]xf32> from vector<1x[3]xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
// CHECK: return %[[c0]] : vector<2x[3]xf32>
-func.func @matmul_2_scalable(%arg0: vector<1x2xf32>,
- %arg1: vector<1x[3]xf32>,
- %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32>
+func.func @matmul_2_scalable(%A: vector<1x2xf32>,
+ %B: vector<1x[3]xf32>,
+ %C: vector<2x[3]xf32>) -> vector<2x[3]xf32>
{
- %0 = vector.contract #matmat_trait_2 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_2 %A, %B, %C
: vector<1x2xf32>, vector<1x[3]xf32> into vector<2x[3]xf32>
return %0 : vector<2x[3]xf32>
}
// ============================================================================
-// Matmul 3 (plain)
+// Matmul 3 (plain + scalable)
// ============================================================================
-#matmat_accesses_3 = [
- affine_map<(m, n, k) -> (k, m)>,
- affine_map<(m, n, k) -> (n, k)>,
- affine_map<(m, n, k) -> (m, n)>
-]
-#matmat_trait_3 = {
- indexing_maps = #matmat_accesses_3,
- iterator_types = ["parallel", "parallel", "reduction"]
-}
-
// CHECK-LABEL: func @matmul_3
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<1x2xf32>,
// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<3x1xf32>,
@@ -309,11 +324,11 @@ func.func @matmul_2_scalable(%arg0: vector<1x2xf32>,
// CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<3xf32> from vector<1x3xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
// CHECK: return %[[c0]] : vector<2x3xf32>
-func.func @matmul_3(%arg0: vector<1x2xf32>,
- %arg1: vector<3x1xf32>,
- %arg2: vector<2x3xf32>) -> vector<2x3xf32>
+func.func @matmul_3(%A: vector<1x2xf32>,
+ %B: vector<3x1xf32>,
+ %C: vector<2x3xf32>) -> vector<2x3xf32>
{
- %0 = vector.contract #matmat_trait_3 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_3 %A, %B, %C
: vector<1x2xf32>, vector<3x1xf32> into vector<2x3xf32>
return %0 : vector<2x3xf32>
}
@@ -327,28 +342,18 @@ func.func @matmul_3(%arg0: vector<1x2xf32>,
// CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<[3]xf32> from vector<1x[3]xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]]
// CHECK: return %[[c0]] : vector<2x[3]xf32>
-func.func @matmul_3_scalable(%arg0: vector<1x2xf32>,
- %arg1: vector<[3]x1xf32>,
- %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32>
+func.func @matmul_3_scalable(%A: vector<1x2xf32>,
+ %B: vector<[3]x1xf32>,
+ %C: vector<2x[3]xf32>) -> vector<2x[3]xf32>
{
- %0 = vector.contract #matmat_trait_3 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_3 %A, %B, %C
: vector<1x2xf32>, vector<[3]x1xf32> into vector<2x[3]xf32>
return %0 : vector<2x[3]xf32>
}
// ============================================================================
-// Matmul 4 (plain)
+// Matmul 4 (plain + scalable)
// ============================================================================
-#matmat_accesses_4 = [
- affine_map<(m, n, k) -> (m, k)>,
- affine_map<(m, n, k) -> (k, n)>,
- affine_map<(m, n, k) -> (n, m)>
-]
-#matmat_trait_4 = {
- indexing_maps = #matmat_accesses_4,
- iterator_types = ["parallel", "parallel", "reduction"]
-}
-
// CHECK-LABEL: func @matmul_4
// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>,
// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>,
@@ -358,11 +363,11 @@ func.func @matmul_3_scalable(%arg0: vector<1x2xf32>,
// CHECK: %[[a0:.*]] = vector.extract %[[At]][0] : vector<2xf32> from vector<1x2xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]]
// CHECK: return %[[c0]] : vector<3x2xf32>
-func.func @matmul_4(%arg0: vector<2x1xf32>,
- %arg1: vector<1x3xf32>,
- %arg2: vector<3x2xf32>) -> vector<3x2xf32>
+func.func @matmul_4(%A: vector<2x1xf32>,
+ %B: vector<1x3xf32>,
+ %C: vector<3x2xf32>) -> vector<3x2xf32>
{
- %0 = vector.contract #matmat_trait_4 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_4 %A, %B, %C
: vector<2x1xf32>, vector<1x3xf32> into vector<3x2xf32>
return %0 : vector<3x2xf32>
}
@@ -376,11 +381,11 @@ func.func @matmul_4(%arg0: vector<2x1xf32>,
// CHECK: %[[a0:.*]] = vector.extract %[[At]][0] : vector<[2]xf32> from vector<1x[2]xf32>
// CHECK: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]]
// CHECK: return %[[c0]] : vector<3x[2]xf32>
-func.func @matmul_4_scalable(%arg0: vector<[2]x1xf32>,
- %arg1: vector<1x3xf32>,
- %arg2: vector<3x[2]xf32>) -> vector<3x[2]xf32>
+func.func @matmul_4_scalable(%A: vector<[2]x1xf32>,
+ %B: vector<1x3xf32>,
+ %C: vector<3x[2]xf32>) -> vector<3x[2]xf32>
{
- %0 = vector.contract #matmat_trait_4 %arg0, %arg1, %arg2
+ %0 = vector.contract #matmat_trait_4 %A, %B, %C
: vector<[2]x1xf32>, vector<1x3xf32> into vector<3x[2]xf32>
return %0 : vector<3x[2]xf32>
}
diff --git a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
index c09a4d569638..d86c6158bcdf 100644
--- a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
@@ -235,6 +235,23 @@ func.func @masked_matvec_mk_k_m_max_scalable_parallel_dim(%A: vector<[2]x3xf32>,
// ============================================================================
// Matvec 2 (plain + masked + scalable)
// ============================================================================
+// CHECK-LABEL: func @matvec_km_k_m
+// CHECK-SAME: %[[A:.*0]]: vector<2x2xf32>
+// CHECK-SAME: %[[X:.*1]]: vector<2xf32>
+// CHECK-SAME: %[[B:.*2]]: vector<2xf32>
+// CHECK: %[[T3:.*]] = vector.extract %[[A]][0] : vector<2xf32> from vector<2x2xf32>
+// CHECK: %[[T4:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32>
+// CHECK: %[[T5:.*]] = vector.outerproduct %[[T3]], %[[T4]], %[[B]] {kind = #vector.kind<add>} : vector<2xf32>, f32
+// CHECK: %[[T6:.*]] = vector.extract %[[A]][1] : vector<2xf32> from vector<2x2xf32>
+// CHECK: %[[T7:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32>
+// CHECK: %[[T8:.*]] = vector.outerproduct %[[T6]], %[[T7]], %[[T5]] {kind = #vector.kind<add>} : vector<2xf32>, f32
+func.func @matvec_km_k_m(%A: vector<2x2xf32>,
+ %x: vector<2xf32>,
+ %b: vector<2xf32>) -> vector<2xf32> {
+ %0 = vector.contract #matvec_trait_2 %A, %x, %b : vector<2x2xf32>, vector<2xf32> into vector<2xf32>
+ return %0 : vector<2xf32>
+}
+
// CHECK-LABEL: @masked_matvec_km_k_m
// CHECK-SAME: %[[A:.+]]: vector<2x4xf32>
// CHECK-SAME: %[[X:.+]]: vector<2xf32>
@@ -273,26 +290,27 @@ func.func @masked_matvec_km_k_m_scalable_parallel_dim(%A: vector<2x[4]xf32>,
return %res : vector<[4]xf32>
}
-// CHECK-LABEL: func @matvec_km_k_m
+// ============================================================================
+// Matvec 3 (plain + masked + scalable)
+// ============================================================================
+// CHECK-LABEL: func @matvec_k_mk_m
// CHECK-SAME: %[[A:.*0]]: vector<2x2xf32>
// CHECK-SAME: %[[X:.*1]]: vector<2xf32>
// CHECK-SAME: %[[B:.*2]]: vector<2xf32>
-// CHECK: %[[T3:.*]] = vector.extract %[[A]][0] : vector<2xf32> from vector<2x2xf32>
-// CHECK: %[[T4:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32>
-// CHECK: %[[T5:.*]] = vector.outerproduct %[[T3]], %[[T4]], %[[B]] {kind = #vector.kind<add>} : vector<2xf32>, f32
-// CHECK: %[[T6:.*]] = vector.extract %[[A]][1] : vector<2xf32> from vector<2x2xf32>
-// CHECK: %[[T7:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32>
-// CHECK: %[[T8:.*]] = vector.outerproduct %[[T6]], %[[T7]], %[[T5]] {kind = #vector.kind<add>} : vector<2xf32>, f32
-func.func @matvec_km_k_m(%A: vector<2x2xf32>,
+// CHECK: %[[T3:.*]] = vector.transpose %[[A]], [1, 0] : vector<2x2xf32> to vector<2x2xf32>
+// CHECK: %[[T4:.*]] = vector.extract %[[T3]][0] : vector<2xf32> from vector<2x2xf32>
+// CHECK: %[[T5:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32>
+// CHECK: %[[T6:.*]] = vector.outerproduct %[[T4]], %[[T5]], %[[B]] {kind = #vector.kind<add>} : vector<2xf32>, f32
+// CHECK: %[[T7:.*]] = vector.extract %[[T3]][1] : vector<2xf32> from vector<2x2xf32>
+// CHECK: %[[T8:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32>
+// CHECK: %[[T9:.*]] = vector.outerproduct %[[T7]], %[[T8]], %[[T6]] {kind = #vector.kind<add>} : vector<2xf32>, f32
+func.func @matvec_k_mk_m(%A: vector<2x2xf32>,
%x: vector<2xf32>,
%b: vector<2xf32>) -> vector<2xf32> {
- %0 = vector.contract #matvec_trait_2 %A, %x, %b : vector<2x2xf32>, vector<2xf32> into vector<2xf32>
+ %0 = vector.contract #matvec_trait_3 %x, %A, %b : vector<2xf32>, vector<2x2xf32> into vector<2xf32>
return %0 : vector<2xf32>
}
-// ============================================================================
-// Matvec 3 (plain + masked + scalable)
-// ============================================================================
// CHECK-LABEL: @masked_matvec_k_mk_m
// CHECK-SAME: %[[A:.+]]: vector<4x2xf32>
// CHECK-SAME: %[[X:.+]]: vector<2xf32>
@@ -331,24 +349,6 @@ func.func @masked_matvec_k_mk_m_scalable_parallel_dim(%A: vector<[4]x2xf32>,
return %res : vector<[4]xf32>
}
-// CHECK-LABEL: func @matvec_k_mk_m
-// CHECK-SAME: %[[A:.*0]]: vector<2x2xf32>
-// CHECK-SAME: %[[X:.*1]]: vector<2xf32>
-// CHECK-SAME: %[[B:.*2]]: vector<2xf32>
-// CHECK: %[[T3:.*]] = vector.transpose %[[A]], [1, 0] : vector<2x2xf32> to vector<2x2xf32>
-// CHECK: %[[T4:.*]] = vector.extract %[[T3]][0] : vector<2xf32> from vector<2x2xf32>
-// CHECK: %[[T5:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32>
-// CHECK: %[[T6:.*]] = vector.outerproduct %[[T4]], %[[T5]], %[[B]] {kind = #vector.kind<add>} : vector<2xf32>, f32
-// CHECK: %[[T7:.*]] = vector.extract %[[T3]][1] : vector<2xf32> from vector<2x2xf32>
-// CHECK: %[[T8:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32>
-// CHECK: %[[T9:.*]] = vector.outerproduct %[[T7]], %[[T8]], %[[T6]] {kind = #vector.kind<add>} : vector<2xf32>, f32
-func.func @matvec_k_mk_m(%A: vector<2x2xf32>,
- %x: vector<2xf32>,
- %b: vector<2xf32>) -> vector<2xf32> {
- %0 = vector.contract #matvec_trait_3 %x, %A, %b : vector<2xf32>, vector<2x2xf32> into vector<2xf32>
- return %0 : vector<2xf32>
-}
-
// ============================================================================
// Matvec 4 (plain + masked + scalable)
// ============================================================================
diff --git a/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir
index 6ad817a73408..52b8c16d753d 100644
--- a/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir
@@ -33,26 +33,26 @@ func.func @main() {
%alloc = memref.alloc() : memref<5xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32> to memref<10xf32>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32>) -> memref<10xf32>
// CHECK-NEXT: ^ size mismatch of dim 0
// CHECK-NEXT: Location: loc({{.*}})
%1 = memref.cast %alloc : memref<5xf32> to memref<?xf32>
func.call @cast_to_static_dim(%1) : (memref<?xf32>) -> (memref<10xf32>)
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<*xf32> to memref<f32>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<*xf32>) -> memref<f32>
// CHECK-NEXT: ^ rank mismatch
// CHECK-NEXT: Location: loc({{.*}})
%3 = memref.cast %alloc : memref<5xf32> to memref<*xf32>
func.call @cast_to_ranked(%3) : (memref<*xf32>) -> (memref<f32>)
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>> to memref<?xf32, strided<[9], offset: 5>>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>>
// CHECK-NEXT: ^ offset mismatch
// CHECK-NEXT: Location: loc({{.*}})
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>> to memref<?xf32, strided<[9], offset: 5>>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>>
// CHECK-NEXT: ^ stride mismatch of dim 0
// CHECK-NEXT: Location: loc({{.*}})
%4 = memref.cast %alloc
diff --git a/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir
new file mode 100644
index 000000000000..169dfd705645
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir
@@ -0,0 +1,67 @@
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN: -expand-strided-metadata \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -test-cf-assert \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+func.func @load(%memref: memref<1xf32>, %index: index) {
+ memref.load %memref[%index] : memref<1xf32>
+ return
+}
+
+func.func @load_dynamic(%memref: memref<?xf32>, %index: index) {
+ memref.load %memref[%index] : memref<?xf32>
+ return
+}
+
+func.func @load_nd_dynamic(%memref: memref<?x?x?xf32>, %index0: index, %index1: index, %index2: index) {
+ memref.load %memref[%index0, %index1, %index2] : memref<?x?x?xf32>
+ return
+}
+
+func.func @main() {
+ %0 = arith.constant 0 : index
+ %1 = arith.constant 1 : index
+ %n1 = arith.constant -1 : index
+ %2 = arith.constant 2 : index
+ %alloca_1 = memref.alloca() : memref<1xf32>
+ %alloc_1 = memref.alloc(%1) : memref<?xf32>
+ %alloc_2x2x2 = memref.alloc(%2, %2, %2) : memref<?x?x?xf32>
+
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<1xf32>, index) -> f32
+ // CHECK-NEXT: ^ out-of-bounds access
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @load(%alloca_1, %1) : (memref<1xf32>, index) -> ()
+
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?xf32>, index) -> f32
+ // CHECK-NEXT: ^ out-of-bounds access
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @load_dynamic(%alloc_1, %1) : (memref<?xf32>, index) -> ()
+
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?x?x?xf32>, index, index, index) -> f32
+ // CHECK-NEXT: ^ out-of-bounds access
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @load_nd_dynamic(%alloc_2x2x2, %1, %n1, %0) : (memref<?x?x?xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @load(%alloca_1, %0) : (memref<1xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @load_dynamic(%alloc_1, %0) : (memref<?xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @load_nd_dynamic(%alloc_2x2x2, %1, %1, %0) : (memref<?x?x?xf32>, index, index, index) -> ()
+
+ memref.dealloc %alloc_1 : memref<?xf32>
+ memref.dealloc %alloc_2x2x2 : memref<?x?x?xf32>
+
+ return
+}
+
diff --git a/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
new file mode 100644
index 000000000000..370029154054
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
@@ -0,0 +1,74 @@
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN: -lower-affine \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -test-cf-assert \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+func.func @reinterpret_cast(%memref: memref<1xf32>, %offset: index) {
+ memref.reinterpret_cast %memref to
+ offset: [%offset],
+ sizes: [1],
+ strides: [1]
+ : memref<1xf32> to memref<1xf32, strided<[1], offset: ?>>
+ return
+}
+
+func.func @reinterpret_cast_fully_dynamic(%memref: memref<?xf32>, %offset: index, %size: index, %stride: index) {
+ memref.reinterpret_cast %memref to
+ offset: [%offset],
+ sizes: [%size],
+ strides: [%stride]
+ : memref<?xf32> to memref<?xf32, strided<[?], offset: ?>>
+ return
+}
+
+func.func @main() {
+ %0 = arith.constant 0 : index
+ %1 = arith.constant 1 : index
+ %n1 = arith.constant -1 : index
+ %4 = arith.constant 4 : index
+ %5 = arith.constant 5 : index
+
+ %alloca_1 = memref.alloca() : memref<1xf32>
+ %alloc_4 = memref.alloc(%4) : memref<?xf32>
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast(%alloca_1, %1) : (memref<1xf32>, index) -> ()
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast(%alloca_1, %n1) : (memref<1xf32>, index) -> ()
+
+ // Size is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %5, %1) : (memref<?xf32>, index, index, index) -> ()
+
+ // Stride is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %4) : (memref<?xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @reinterpret_cast(%alloca_1, %0) : (memref<1xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %1) : (memref<?xf32>, index, index, index) -> ()
+
+ return
+}
diff --git a/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
new file mode 100644
index 000000000000..48987ce216f1
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
@@ -0,0 +1,89 @@
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN: -expand-strided-metadata \
+// RUN: -lower-affine \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -test-cf-assert \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+func.func @subview(%memref: memref<1xf32>, %offset: index) {
+ memref.subview %memref[%offset] [1] [1] :
+ memref<1xf32> to
+ memref<1xf32, strided<[1], offset: ?>>
+ return
+}
+
+func.func @subview_dynamic(%memref: memref<?x4xf32>, %offset: index, %size: index, %stride: index) {
+ memref.subview %memref[%offset, 0] [%size, 4] [%stride, 1] :
+ memref<?x4xf32> to
+ memref<?x4xf32, strided<[?, 1], offset: ?>>
+ return
+}
+
+func.func @subview_dynamic_rank_reduce(%memref: memref<?x4xf32>, %offset: index, %size: index, %stride: index) {
+ memref.subview %memref[%offset, 0] [%size, 1] [%stride, 1] :
+ memref<?x4xf32> to
+ memref<?xf32, strided<[?], offset: ?>>
+ return
+}
+
+func.func @main() {
+ %0 = arith.constant 0 : index
+ %1 = arith.constant 1 : index
+ %n1 = arith.constant -1 : index
+ %4 = arith.constant 4 : index
+ %5 = arith.constant 5 : index
+
+ %alloca = memref.alloca() : memref<1xf32>
+ %alloc = memref.alloc(%4) : memref<?x4xf32>
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview_dynamic_rank_reduce(%alloc, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview(%alloca, %1) : (memref<1xf32>, index) -> ()
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview(%alloca, %n1) : (memref<1xf32>, index) -> ()
+
+ // Size is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview_dynamic(%alloc, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // Stride is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview_dynamic(%alloc, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @subview(%alloca, %0) : (memref<1xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @subview_dynamic(%alloc, %0, %4, %1) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @subview_dynamic_rank_reduce(%alloc, %0, %1, %0) : (memref<?x4xf32>, index, index, index) -> ()
+
+ return
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir
index 95030fdf258b..dba897334830 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose_coo.mlir
@@ -17,8 +17,7 @@
// DEFINE: %{env} =
//--------------------------------------------------------------------------------------------------
-// FIXME: lib path does not support all of COO yet
-// R_U_N: %{compile} | %{run} | FileCheck %s
+// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false
diff --git a/mlir/test/Target/Cpp/const.mlir b/mlir/test/Target/Cpp/const.mlir
index e6c94732e9f6..28a547909a0a 100644
--- a/mlir/test/Target/Cpp/const.mlir
+++ b/mlir/test/Target/Cpp/const.mlir
@@ -2,7 +2,7 @@
// RUN: mlir-translate -mlir-to-cpp -declare-variables-at-top %s | FileCheck %s -check-prefix=CPP-DECLTOP
func.func @emitc_constant() {
- %c0 = "emitc.constant"(){value = #emitc.opaque<"">} : () -> i32
+ %c0 = "emitc.constant"(){value = #emitc.opaque<"INT_MAX">} : () -> i32
%c1 = "emitc.constant"(){value = 42 : i32} : () -> i32
%c2 = "emitc.constant"(){value = -1 : i32} : () -> i32
%c3 = "emitc.constant"(){value = -1 : si8} : () -> si8
@@ -11,7 +11,7 @@ func.func @emitc_constant() {
return
}
// CPP-DEFAULT: void emitc_constant() {
-// CPP-DEFAULT-NEXT: int32_t [[V0:[^ ]*]];
+// CPP-DEFAULT-NEXT: int32_t [[V0:[^ ]*]] = INT_MAX;
// CPP-DEFAULT-NEXT: int32_t [[V1:[^ ]*]] = 42;
// CPP-DEFAULT-NEXT: int32_t [[V2:[^ ]*]] = -1;
// CPP-DEFAULT-NEXT: int8_t [[V3:[^ ]*]] = -1;
@@ -25,7 +25,7 @@ func.func @emitc_constant() {
// CPP-DECLTOP-NEXT: int8_t [[V3:[^ ]*]];
// CPP-DECLTOP-NEXT: uint8_t [[V4:[^ ]*]];
// CPP-DECLTOP-NEXT: char [[V5:[^ ]*]];
-// CPP-DECLTOP-NEXT: ;
+// CPP-DECLTOP-NEXT: [[V0]] = INT_MAX;
// CPP-DECLTOP-NEXT: [[V1]] = 42;
// CPP-DECLTOP-NEXT: [[V2]] = -1;
// CPP-DECLTOP-NEXT: [[V3]] = -1;
diff --git a/mlir/test/Target/LLVMIR/Import/calling-convention.ll b/mlir/test/Target/LLVMIR/Import/calling-convention.ll
new file mode 100644
index 000000000000..bf5dc6a694f2
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/Import/calling-convention.ll
@@ -0,0 +1,32 @@
+; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s
+
+; CHECK: llvm.func fastcc @cconv_fastcc()
+declare fastcc void @cconv_fastcc()
+; CHECK: llvm.func @cconv_ccc()
+declare ccc void @cconv_ccc()
+
+; CHECK-LABEL: @call_cconv
+define void @call_cconv() {
+ ; CHECK: llvm.call fastcc @cconv_fastcc()
+ call fastcc void @cconv_fastcc()
+ ; CHECK: llvm.call @cconv_ccc()
+ call ccc void @cconv_ccc()
+ ret void
+}
+
+; // -----
+
+; CHECK: llvm.func fastcc @cconv_fastcc()
+declare fastcc void @cconv_fastcc()
+declare i32 @__gxx_personality_v0(...)
+
+; CHECK-LABEL: @invoke_cconv
+define i32 @invoke_cconv() personality ptr @__gxx_personality_v0 {
+ ; CHECK: llvm.invoke fastcc @cconv_fastcc() to ^bb2 unwind ^bb1 : () -> ()
+ invoke fastcc void @cconv_fastcc() to label %bb2 unwind label %bb1
+bb1:
+ %1 = landingpad { ptr, i32 } cleanup
+ br label %bb2
+bb2:
+ ret i32 1
+}
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index f52ad6b56d14..1ec9005458c5 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -720,6 +720,13 @@ define void @coro_resume(ptr %0) {
ret void
}
+; CHECK-LABEL: llvm.func @coro_promise
+define void @coro_promise(ptr %0, i32 %1, i1 %2) {
+ ; CHECK: llvm.intr.coro.promise %{{.*}}, %{{.*}}, %{{.*}} : (!llvm.ptr, i32, i1) -> !llvm.ptr
+ %4 = call ptr @llvm.coro.promise(ptr %0, i32 %1, i1 %2)
+ ret void
+}
+
; CHECK-LABEL: llvm.func @eh_typeid_for
define void @eh_typeid_for(ptr %0) {
; CHECK: llvm.intr.eh.typeid.for %{{.*}} : (!llvm.ptr) -> i32
@@ -1046,6 +1053,7 @@ declare i8 @llvm.coro.suspend(token, i1)
declare i1 @llvm.coro.end(ptr, i1, token)
declare ptr @llvm.coro.free(token, ptr nocapture readonly)
declare void @llvm.coro.resume(ptr)
+declare ptr @llvm.coro.promise(ptr nocapture, i32, i1)
declare i32 @llvm.eh.typeid.for(ptr)
declare ptr @llvm.stacksave.p0()
declare ptr addrspace(1) @llvm.stacksave.p1()
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index 1c0aa8d3407a..fc2e0fd201a7 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -712,6 +712,13 @@ llvm.func @coro_resume(%arg0: !llvm.ptr) {
llvm.return
}
+// CHECK-LABEL: @coro_promise
+llvm.func @coro_promise(%arg0: !llvm.ptr, %arg1 : i32, %arg2 : i1) {
+ // CHECK: call ptr @llvm.coro.promise
+ %0 = llvm.intr.coro.promise %arg0, %arg1, %arg2 : (!llvm.ptr, i32, i1) -> !llvm.ptr
+ llvm.return
+}
+
// CHECK-LABEL: @eh_typeid_for
llvm.func @eh_typeid_for(%arg0 : !llvm.ptr) {
// CHECK: call i32 @llvm.eh.typeid.for
@@ -1056,6 +1063,7 @@ llvm.func @ssa_copy(%arg: f32) -> f32 {
// CHECK-DAG: declare i1 @llvm.coro.end(ptr, i1, token)
// CHECK-DAG: declare ptr @llvm.coro.free(token, ptr nocapture readonly)
// CHECK-DAG: declare void @llvm.coro.resume(ptr)
+// CHECK-DAG: declare ptr @llvm.coro.promise(ptr nocapture, i32, i1)
// CHECK-DAG: declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
// CHECK-DAG: declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
// CHECK-DAG: declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
new file mode 100644
index 000000000000..43d0934d3a93
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
@@ -0,0 +1,36 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// The aim of this test is to check the GPU LLVM IR codegen
+// for a nested omp do loop inside an omp target region.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
+ llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr ){
+ omp.parallel {
+ %loop_ub = llvm.mlir.constant(9 : i32) : i32
+ %loop_lb = llvm.mlir.constant(0 : i32) : i32
+ %loop_step = llvm.mlir.constant(1 : i32) : i32
+ omp.wsloop for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
+ %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
+ llvm.store %loop_cnt, %gep : i32, !llvm.ptr
+ omp.yield
+ }
+ omp.terminator
+ }
+
+ llvm.return
+ }
+
+}
+// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast
+// CHECK-SAME: (ptr addrspace(1) @[[GLOB:[0-9]+]] to ptr),
+// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 1, i32 -1, i32 -1,
+// CHECK-SAME: ptr @[[PARALLEL_FUNC:.*]], ptr null, ptr %[[PARALLEL_ARGS:.*]], i64 1)
+
+// CHECK: define internal void @[[PARALLEL_FUNC]]
+// CHECK-SAME: (ptr noalias noundef %[[TID_ADDR:.*]], ptr noalias noundef %[[ZERO_ADDR:.*]],
+// CHECK-SAME: ptr %[[ARG_PTR:.*]])
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr),
+// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOOP_BODY_FUNC_ARG:.*]], i32 10,
+// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0)
+
+// CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) {
diff --git a/mlir/test/Target/SPIRV/physical-storage-buffer.mlir b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir
new file mode 100644
index 000000000000..040cfb891cb3
--- /dev/null
+++ b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir
@@ -0,0 +1,48 @@
+// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s
+
+// Test file showing how the Physical Storage Buffer extension works end-to-end.
+
+!f32_binding = !spirv.struct<binding_f32_t, (!spirv.rtarray<f32, stride=4> [0])>
+!f32_binding_ptr = !spirv.ptr<!f32_binding, PhysicalStorageBuffer>
+
+!set_0 = !spirv.struct<set_0_t, (!f32_binding_ptr [0],
+ !f32_binding_ptr [8],
+ !f32_binding_ptr [16])>
+!set_0_ptr = !spirv.ptr<!set_0, StorageBuffer>
+
+!set_1 = !spirv.struct<set_1_t, (!f32_binding_ptr [0],
+ !f32_binding_ptr [8])>
+!set_1_ptr = !spirv.ptr<!set_1, StorageBuffer>
+
+spirv.module PhysicalStorageBuffer64 GLSL450 requires #spirv.vce<v1.5,
+ [Shader, Int64, PhysicalStorageBufferAddresses], [SPV_KHR_physical_storage_buffer]> {
+
+ spirv.GlobalVariable @set_0 bind(3, 0) : !set_0_ptr
+ spirv.GlobalVariable @set_1 bind(3, 1) : !set_1_ptr
+
+ // CHECK-LABEL: spirv.func @main() "None"
+ spirv.func @main() "None" {
+ %idx0 = spirv.Constant 0 : i64
+ %idx1 = spirv.Constant 1 : i64
+ %idx2 = spirv.Constant 2 : i64
+ %set_0_addr = spirv.mlir.addressof @set_0 : !set_0_ptr
+ %s0_b2_ptr = spirv.AccessChain %set_0_addr[%idx2] : !set_0_ptr, i64
+ %b2_ptr = spirv.Load "StorageBuffer" %s0_b2_ptr : !f32_binding_ptr
+ %b2_data_ptr = spirv.AccessChain %b2_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64
+
+ // CHECK: spirv.Load "PhysicalStorageBuffer"
+ %b2_data = spirv.Load "PhysicalStorageBuffer" %b2_data_ptr ["Aligned", 4] : f32
+
+ %set_1_addr = spirv.mlir.addressof @set_1 : !set_1_ptr
+ %s1_b1_ptr = spirv.AccessChain %set_1_addr[%idx1] : !set_1_ptr, i64
+ %b1_ptr = spirv.Load "StorageBuffer" %s1_b1_ptr : !f32_binding_ptr
+ %b1_data_ptr = spirv.AccessChain %b1_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64
+
+ // CHECK: spirv.Store "PhysicalStorageBuffer"
+ spirv.Store "PhysicalStorageBuffer" %b1_data_ptr, %b2_data ["Aligned", 4] : f32
+
+ spirv.Return
+ }
+
+ spirv.EntryPoint "GLCompute" @main, @set_0, @set_1
+}
diff --git a/mlir/test/Transforms/canonicalize-debuginfo.mlir b/mlir/test/Transforms/canonicalize-debuginfo.mlir
new file mode 100644
index 000000000000..30c8022daa76
--- /dev/null
+++ b/mlir/test/Transforms/canonicalize-debuginfo.mlir
@@ -0,0 +1,46 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize{test-convergence}))' -split-input-file -mlir-print-debuginfo | FileCheck %s
+
+// CHECK-LABEL: func @merge_constants
+func.func @merge_constants() -> (index, index, index, index) {
+ // CHECK-NEXT: arith.constant 42 : index loc(#[[UnknownLoc:.*]])
+ %0 = arith.constant 42 : index loc("merge_constants":0:0)
+ %1 = arith.constant 42 : index loc("merge_constants":1:0)
+ %2 = arith.constant 42 : index loc("merge_constants":2:0)
+ %3 = arith.constant 42 : index loc("merge_constants":2:0)
+ return %0, %1, %2, %3 : index, index, index, index
+}
+// CHECK: #[[UnknownLoc]] = loc(unknown)
+
+// -----
+
+// CHECK-LABEL: func @simple_hoist
+func.func @simple_hoist(%arg0: memref<8xi32>) -> i32 {
+ // CHECK: arith.constant 88 : i32 loc(#[[UnknownLoc:.*]])
+ // CHECK: arith.constant 42 : i32 loc(#[[ConstLoc0:.*]])
+ // CHECK: arith.constant 0 : index loc(#[[ConstLoc1:.*]])
+ %0 = arith.constant 42 : i32 loc("simple_hoist":0:0)
+ %1 = arith.constant 0 : index loc("simple_hoist":1:0)
+ memref.store %0, %arg0[%1] : memref<8xi32>
+
+ %2 = arith.constant 88 : i32 loc("simple_hoist":2:0)
+
+ return %2 : i32
+}
+// CHECK-DAG: #[[UnknownLoc]] = loc(unknown)
+// CHECK-DAG: #[[ConstLoc0]] = loc("simple_hoist":0:0)
+// CHECK-DAG: #[[ConstLoc1]] = loc("simple_hoist":1:0)
+
+// -----
+
+// CHECK-LABEL: func @hoist_and_merge
+func.func @hoist_and_merge(%arg0: memref<8xi32>) {
+ // CHECK-NEXT: arith.constant 42 : i32 loc(#[[UnknownLoc:.*]])
+ affine.for %arg1 = 0 to 8 {
+ %0 = arith.constant 42 : i32 loc("hoist_and_merge":0:0)
+ %1 = arith.constant 42 : i32 loc("hoist_and_merge":1:0)
+ memref.store %0, %arg0[%arg1] : memref<8xi32>
+ memref.store %1, %arg0[%arg1] : memref<8xi32>
+ }
+ return
+} loc("hoist_and_merge":2:0)
+// CHECK: #[[UnknownLoc]] = loc(unknown)
diff --git a/mlir/test/Transforms/constant-fold-debuginfo.mlir b/mlir/test/Transforms/constant-fold-debuginfo.mlir
new file mode 100644
index 000000000000..c308bc477bee
--- /dev/null
+++ b/mlir/test/Transforms/constant-fold-debuginfo.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-opt %s -split-input-file -test-constant-fold -mlir-print-debuginfo | FileCheck %s
+
+// CHECK-LABEL: func @fold_and_merge
+func.func @fold_and_merge() -> (i32, i32) {
+ // CHECK-NEXT: [[C:%.+]] = arith.constant 6 : i32 loc(#[[UnknownLoc:.*]])
+ %0 = arith.constant 1 : i32 loc("fold_and_merge":0:0)
+ %1 = arith.constant 5 : i32 loc("fold_and_merge":1:0)
+ %2 = arith.addi %0, %1 : i32 loc("fold_and_merge":2:0)
+
+ %3 = arith.constant 6 : i32 loc("fold_and_merge":3:0)
+
+ return %2, %3: i32, i32
+}
+// CHECK: #[[UnknownLoc]] = loc(unknown)
+
+// -----
+
+// CHECK-LABEL: func @materialize_different_dialect
+func.func @materialize_different_dialect() -> (f32, f32) {
+ // CHECK: arith.constant 1.{{0*}}e+00 : f32 loc(#[[UnknownLoc:.*]])
+ %0 = arith.constant -1.0 : f32 loc("materialize_different_dialect":0:0)
+ %1 = math.absf %0 : f32 loc("materialize_different_dialect":1:0)
+ %2 = arith.constant 1.0 : f32 loc("materialize_different_dialect":2:0)
+
+ return %1, %2: f32, f32
+}
+// CHECK: #[[UnknownLoc]] = loc(unknown)
+
+// -----
+
+// CHECK-LABEL: func @materialize_in_front
+func.func @materialize_in_front(%arg0: memref<8xi32>) {
+ // CHECK-NEXT: arith.constant 6 : i32 loc(#[[UnknownLoc:.*]])
+ affine.for %arg1 = 0 to 8 {
+ %1 = arith.constant 1 : i32 loc("materialize_in_front":0:0)
+ %2 = arith.constant 5 : i32 loc("materialize_in_front":1:0)
+ %3 = arith.addi %1, %2 : i32 loc("materialize_in_front":2:0)
+ memref.store %3, %arg0[%arg1] : memref<8xi32>
+ }
+ return
+} loc("materialize_in_front":3:0)
+// CHECK: #[[UnknownLoc]] = loc(unknown)
diff --git a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp
index ed361b5a0e27..b6b33182440c 100644
--- a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp
+++ b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp
@@ -100,7 +100,7 @@ LogicalResult FooAnalysis::initialize(Operation *top) {
return top->emitError("expected at least one block in the region");
// Initialize the top-level state.
- getOrCreate<FooState>(&top->getRegion(0).front())->join(0);
+ (void)getOrCreate<FooState>(&top->getRegion(0).front())->join(0);
// Visit all nested blocks and operations.
for (Block &block : top->getRegion(0)) {
diff --git a/mlir/test/lib/Dialect/GPU/CMakeLists.txt b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
index aa94bce275ea..48cbc4ad5505 100644
--- a/mlir/test/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/GPU/CMakeLists.txt
@@ -27,6 +27,7 @@ set(LIBS
MLIRTransforms
MLIRTransformUtils
MLIRTranslateLib
+ MLIRVectorDialect
MLIRVectorToLLVMPass
)
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index db65f3bccec5..21cc89c0d89b 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -39,10 +40,34 @@ struct TestGpuRewritePass
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
};
+
+struct TestGpuSubgroupReduceLoweringPass
+ : public PassWrapper<TestGpuSubgroupReduceLoweringPass,
+ OperationPass<ModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
+ TestGpuSubgroupReduceLoweringPass)
+
+ void getDependentDialects(DialectRegistry &registry) const override {
+ registry.insert<arith::ArithDialect, vector::VectorDialect>();
+ }
+ StringRef getArgument() const final {
+ return "test-gpu-subgroup-reduce-lowering";
+ }
+ StringRef getDescription() const final {
+ return "Applies gpu.subgroup_reduce lowering patterns.";
+ }
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+    populateGpuBreakDownSubgroupReducePatterns(patterns,
+                                               /*maxShuffleBitwidth=*/32);
+ (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+ }
+};
} // namespace
namespace mlir {
-void registerTestAllReduceLoweringPass() {
+void registerTestGpuLoweringPasses() {
PassRegistration<TestGpuRewritePass>();
+ PassRegistration<TestGpuSubgroupReduceLoweringPass>();
}
} // namespace mlir
diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
index 3e142155df8d..b907f77e9108 100644
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -84,9 +84,9 @@ struct TestTensorTransforms
"the extract_slice of collapse_shape pattern"),
llvm::cl::init(false)};
- Option<bool> testSimplifyPackPatterns{
- *this, "test-simplify-pack-patterns",
- llvm::cl::desc("Test patterns to simplify tensor.pack"),
+ Option<bool> testSimplifyPackUnpackPatterns{
+ *this, "test-simplify-pack-unpack-patterns",
+ llvm::cl::desc("Test patterns to simplify tensor.pack and tensor.unpack"),
llvm::cl::init(false)};
Option<bool> testTrackingListener{
@@ -137,9 +137,9 @@ applyDropRedundantInsertSliceRankExpansionPatterns(Operation *rootOp) {
(void)applyPatternsAndFoldGreedily(rootOp, std::move(patterns));
}
-static void applySimplifyPackPatterns(Operation *rootOp) {
+static void applySimplifyPackUnpackPatterns(Operation *rootOp) {
RewritePatternSet patterns(rootOp->getContext());
- tensor::populateSimplifyTensorPack(patterns);
+ tensor::populateSimplifyPackAndUnpackPatterns(patterns);
(void)applyPatternsAndFoldGreedily(rootOp, std::move(patterns));
}
@@ -376,8 +376,8 @@ static LogicalResult testTrackingListenerReplacements(Operation *rootOp) {
void TestTensorTransforms::runOnOperation() {
Operation *rootOp = getOperation();
- if (testSimplifyPackPatterns)
- applySimplifyPackPatterns(rootOp);
+ if (testSimplifyPackUnpackPatterns)
+ applySimplifyPackUnpackPatterns(rootOp);
if (testFoldConstantExtractSlice)
applyFoldConstantExtractSlicePatterns(rootOp);
if (testFoldConsecutiveInsertExtractSlice)
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 70ccc71883e3..48b41d869876 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -480,6 +480,12 @@ def IsolatedRegionsOp : TEST_Op<"isolated_regions", [IsolatedFromAbove]> {
let assemblyFormat = "attr-dict-with-keyword $regions";
}
+def AllocaScopeRegionOp : TEST_Op<"alloca_scope_region",
+ [AutomaticAllocationScope]> {
+ let regions = (region AnyRegion:$region);
+ let assemblyFormat = "attr-dict-with-keyword $region";
+}
+
//===----------------------------------------------------------------------===//
// NoTerminator Operation
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Transforms/TestIntRangeInference.cpp b/mlir/test/lib/Transforms/TestIntRangeInference.cpp
index 2f6dd5b8095d..5758f6acf2f0 100644
--- a/mlir/test/lib/Transforms/TestIntRangeInference.cpp
+++ b/mlir/test/lib/Transforms/TestIntRangeInference.cpp
@@ -40,9 +40,8 @@ static LogicalResult replaceWithConstant(DataFlowSolver &solver, OpBuilder &b,
maybeDefiningOp ? maybeDefiningOp->getDialect()
: value.getParentRegion()->getParentOp()->getDialect();
Attribute constAttr = b.getIntegerAttr(value.getType(), *maybeConstValue);
- Value constant =
- folder.getOrCreateConstant(b.getInsertionBlock(), valueDialect, constAttr,
- value.getType(), value.getLoc());
+ Value constant = folder.getOrCreateConstant(
+ b.getInsertionBlock(), valueDialect, constAttr, value.getType());
if (!constant)
return failure();
diff --git a/mlir/test/mlir-tblgen/gen-dialect-doc.td b/mlir/test/mlir-tblgen/gen-dialect-doc.td
index ca0b6e38edf8..c9492eb9ac3c 100644
--- a/mlir/test/mlir-tblgen/gen-dialect-doc.td
+++ b/mlir/test/mlir-tblgen/gen-dialect-doc.td
@@ -81,9 +81,9 @@ def TestTypeDefParams : TypeDef<Test_Dialect, "TestTypeDefParams"> {
// CHECK: Other group
// CHECK: test.b
// CHECK: test.c
-// CHECK: Traits: SingleBlock, SingleBlockImplicitTerminator<YieldOp>
-// CHECK: Interfaces: NoMemoryEffect (MemoryEffectOpInterface)
-// CHECK: Effects: MemoryEffects::Effect{}
+// CHECK: Traits: `SingleBlockImplicitTerminator<YieldOp>`, `SingleBlock`
+// CHECK: Interfaces: `NoMemoryEffect (MemoryEffectOpInterface)`
+// CHECK: Effects: `MemoryEffects::Effect{}`
// CHECK: ## Attribute constraints
// CHECK: ### attribute summary
diff --git a/mlir/test/mlir-vulkan-runner/addf_if.mlir b/mlir/test/mlir-vulkan-runner/addf_if.mlir
new file mode 100644
index 000000000000..fbd1fae6d0b5
--- /dev/null
+++ b/mlir/test/mlir-vulkan-runner/addf_if.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-vulkan-runner %s --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s
+
+// CHECK: [3.3, 3.3, 3.3, 3.3, 0, 0, 0, 0]
+module attributes {
+ gpu.container_module,
+ spirv.target_env = #spirv.target_env<
+ #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
+} {
+ gpu.module @kernels {
+ gpu.func @kernel_add(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>)
+ kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} {
+ %0 = gpu.block_id x
+ %limit = arith.constant 4 : index
+ %cond = arith.cmpi slt, %0, %limit : index
+ scf.if %cond {
+ %1 = memref.load %arg0[%0] : memref<8xf32>
+ %2 = memref.load %arg1[%0] : memref<8xf32>
+ %3 = arith.addf %1, %2 : f32
+ memref.store %3, %arg2[%0] : memref<8xf32>
+ }
+ gpu.return
+ }
+ }
+
+ func.func @main() {
+ %arg0 = memref.alloc() : memref<8xf32>
+ %arg1 = memref.alloc() : memref<8xf32>
+ %arg2 = memref.alloc() : memref<8xf32>
+ %0 = arith.constant 0 : i32
+ %1 = arith.constant 1 : i32
+ %2 = arith.constant 2 : i32
+ %value0 = arith.constant 0.0 : f32
+ %value1 = arith.constant 1.1 : f32
+ %value2 = arith.constant 2.2 : f32
+ %arg3 = memref.cast %arg0 : memref<8xf32> to memref<?xf32>
+ %arg4 = memref.cast %arg1 : memref<8xf32> to memref<?xf32>
+ %arg5 = memref.cast %arg2 : memref<8xf32> to memref<?xf32>
+ call @fillResource1DFloat(%arg3, %value1) : (memref<?xf32>, f32) -> ()
+ call @fillResource1DFloat(%arg4, %value2) : (memref<?xf32>, f32) -> ()
+ call @fillResource1DFloat(%arg5, %value0) : (memref<?xf32>, f32) -> ()
+
+ %cst1 = arith.constant 1 : index
+ %cst8 = arith.constant 8 : index
+ gpu.launch_func @kernels::@kernel_add
+ blocks in (%cst8, %cst1, %cst1) threads in (%cst1, %cst1, %cst1)
+ args(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>)
+ %arg6 = memref.cast %arg5 : memref<?xf32> to memref<*xf32>
+ call @printMemrefF32(%arg6) : (memref<*xf32>) -> ()
+ return
+ }
+ func.func private @fillResource1DFloat(%0 : memref<?xf32>, %1 : f32)
+ func.func private @printMemrefF32(%ptr : memref<*xf32>)
+}
+
diff --git a/mlir/test/python/dialects/arith_dialect.py b/mlir/test/python/dialects/arith_dialect.py
index f80f2c084a0f..8bb80eed2b81 100644
--- a/mlir/test/python/dialects/arith_dialect.py
+++ b/mlir/test/python/dialects/arith_dialect.py
@@ -75,7 +75,7 @@ def testArithValue():
f64_t = F64Type.get()
with InsertionPoint(module.body):
- a = arith.constant(value=FloatAttr.get(f16_t, 42.42))
+ a = arith.constant(f16_t, 42.42)
# CHECK: ArithValue(%cst = arith.constant 4.240
print(a)
@@ -83,12 +83,12 @@ def testArithValue():
# CHECK: ArithValue(%0 = arith.addf %cst, %cst : f16)
print(b)
- a = arith.constant(value=FloatAttr.get(f32_t, 42.42))
+ a = arith.constant(f32_t, 42.42)
b = a - a
# CHECK: ArithValue(%1 = arith.subf %cst_0, %cst_0 : f32)
print(b)
- a = arith.constant(value=FloatAttr.get(f64_t, 42.42))
+ a = arith.constant(f64_t, 42.42)
b = a * a
# CHECK: ArithValue(%2 = arith.mulf %cst_1, %cst_1 : f64)
print(b)
diff --git a/mlir/test/python/dialects/spirv_dialect.py b/mlir/test/python/dialects/spirv_dialect.py
new file mode 100644
index 000000000000..d5b9e6cedb5d
--- /dev/null
+++ b/mlir/test/python/dialects/spirv_dialect.py
@@ -0,0 +1,21 @@
+# RUN: %PYTHON %s | FileCheck %s
+
+from mlir.ir import *
+import mlir.dialects.spirv as spirv
+
+
+def run(f):
+ print("\nTEST:", f.__name__)
+ f()
+
+
+# CHECK-LABEL: TEST: testConstantOps
+@run
+def testConstantOps():
+ with Context() as ctx, Location.unknown():
+ module = Module.create()
+ with InsertionPoint(module.body):
+ i32 = IntegerType.get_signless(32)
+ spirv.ConstantOp(value=IntegerAttr.get(i32, 42), constant=i32)
+ # CHECK: spirv.Constant 42 : i32
+ print(module)
diff --git a/mlir/test/python/dialects/tensor.py b/mlir/test/python/dialects/tensor.py
index b690c934dc46..ca9066b23911 100644
--- a/mlir/test/python/dialects/tensor.py
+++ b/mlir/test/python/dialects/tensor.py
@@ -4,6 +4,7 @@ from mlir.ir import *
import mlir.dialects.arith as arith
import mlir.dialects.func as func
import mlir.dialects.tensor as tensor
+from mlir.extras import types as T
def run(f):
@@ -139,3 +140,37 @@ def testFromElementsOp():
t = tensor.FromElementsOp(RankedTensorType.get((1, 2), f32), [c0, c1])
# CHECK: %{{.*}} = "tensor.from_elements"(%[[C0]], %[[C1]]) : (f32, f32) -> tensor<1x2xf32>
print(t)
+
+
+# CHECK-LABEL: TEST: testGenerateRegionOp
+@run
+def testGenerateRegionOp():
+ S = ShapedType.get_dynamic_size()
+ with Context(), Location.unknown():
+ module = Module.create()
+ with InsertionPoint(module.body):
+ # CHECK: %[[VAL_0:.*]] = arith.constant 1 : index
+ # CHECK: %[[VAL_1:.*]] = arith.constant 2 : index
+ one = arith.constant(T.index(), 1)
+ two = arith.constant(T.index(), 2)
+
+ @tensor.generate(T.tensor(S, 3, S, T.index()), dynamic_extents=[one, two])
+ def generate_one(i: T.index(), j: T.index(), k: T.index()):
+ ij = arith.addi(i, j)
+ ijk = arith.addi(ij, k)
+ return ijk
+
+ assert (
+ isinstance(generate_one, Value)
+ and generate_one.owner.name == "tensor.generate"
+ )
+
+ # CHECK: %[[GENERATED:.*]] = tensor.generate
+ # CHECK-SAME: %[[VAL_0]],
+ # CHECK-SAME: %[[VAL_1]] {
+ # CHECK: ^bb0(%[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index):
+ # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_1]], %[[VAL_2]] : index
+ # CHECK: %[[VAL_5:.*]] = arith.addi %[[VAL_4]], %[[VAL_3]] : index
+ # CHECK: tensor.yield %[[VAL_5]] : index
+ # CHECK: } : tensor<?x3x?xindex>
+ print(module)
diff --git a/mlir/test/python/dialects/transform_extras.py b/mlir/test/python/dialects/transform_extras.py
index dbfa8a2dc73c..358f8c32f75c 100644
--- a/mlir/test/python/dialects/transform_extras.py
+++ b/mlir/test/python/dialects/transform_extras.py
@@ -2,9 +2,34 @@
from typing import Callable
from mlir import ir
-from mlir.dialects import scf
-from mlir.dialects.transform import structured
-from mlir.extras.dialects.transform import OpHandle, insert_transform_script
+from mlir.dialects import scf, pdl
+from mlir.dialects.transform import (
+ structured,
+ get_parent_op,
+ apply_patterns_canonicalization,
+ apply_cse,
+ any_op_t,
+)
+from mlir.dialects.transform import FailurePropagationMode
+from mlir.dialects.transform.structured import structured_match
+from mlir.dialects.transform.loop import loop_unroll
+from mlir.dialects.transform.extras import (
+ OpHandle,
+ insert_transform_script,
+ sequence,
+ apply_patterns,
+)
+from mlir.extras import types as T
+
+
+def construct_and_print_in_module(f):
+ print("\nTEST:", f.__name__)
+ with ir.Context(), ir.Location.unknown():
+ module = ir.Module.create()
+ with ir.InsertionPoint(module.body):
+ f()
+ print(module)
+ return f
def build_transform_script(script: Callable[[OpHandle], None]):
@@ -93,3 +118,45 @@ def test_match_ops_mixed(op: OpHandle):
# CHECK-NEXT: %[[VAL_1:.*]] = transform.structured.match
# CHECK-SAME: ops{["scf.for", "linalg.matmul", "scf.forall"]} in %[[VAL_0]]
# CHECK-SAME: -> !transform.any_op
+
+
+# CHECK-LABEL: TEST: test_sequence_region
+@construct_and_print_in_module
+def test_sequence_region():
+ # CHECK: transform.sequence failures(propagate) {
+ # CHECK: ^{{.*}}(%[[VAL_0:.*]]: !transform.any_op):
+ # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["arith.addi"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+ # CHECK: %[[VAL_2:.*]] = get_parent_op %[[VAL_1]] {op_name = "scf.for"} : (!transform.any_op) -> !pdl.operation
+ # CHECK: transform.loop.unroll %[[VAL_2]] {factor = 4 : i64} : !pdl.operation
+ # CHECK: }
+ @sequence([], FailurePropagationMode.Propagate, [])
+ def basic(target: any_op_t()):
+ m = structured_match(any_op_t(), target, ops=["arith.addi"])
+ loop = get_parent_op(pdl.op_t(), m, op_name="scf.for")
+ loop_unroll(loop, 4)
+
+
+# CHECK-LABEL: TEST: test_apply_patterns
+@construct_and_print_in_module
+def test_apply_patterns():
+ # CHECK: transform.sequence failures(propagate) {
+ # CHECK: ^{{.*}}(%[[VAL_0:.*]]: !transform.any_op):
+ # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["linalg.matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+ # CHECK: %[[VAL_2:.*]] = get_parent_op %[[VAL_1]] {op_name = "func.func"} : (!transform.any_op) -> !pdl.operation
+ # CHECK: apply_patterns to %[[VAL_2]] {
+ # CHECK: transform.apply_patterns.canonicalization
+ # CHECK: } : !pdl.operation
+ # CHECK: %[[VAL_3:.*]] = transform.structured.match ops{["func.func"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+ # CHECK: apply_cse to %[[VAL_3]] : !transform.any_op
+ # CHECK: }
+ @sequence([], FailurePropagationMode.Propagate, [])
+ def basic(variant_op: any_op_t()):
+ matmul = structured_match(any_op_t(), variant_op, ops=["linalg.matmul"])
+ top_func = get_parent_op(pdl.op_t(), matmul, op_name="func.func")
+
+ @apply_patterns(top_func)
+ def pats():
+ apply_patterns_canonicalization()
+
+ top_func = structured_match(any_op_t(), variant_op, ops=["func.func"])
+ apply_cse(top_func)
diff --git a/mlir/test/python/integration/dialects/transform.py b/mlir/test/python/integration/dialects/transform.py
new file mode 100644
index 000000000000..bc88a61314d0
--- /dev/null
+++ b/mlir/test/python/integration/dialects/transform.py
@@ -0,0 +1,155 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+from mlir.passmanager import PassManager
+from mlir.ir import Context, Location, Module, InsertionPoint, UnitAttr
+from mlir.dialects import scf, pdl, func, arith, linalg
+from mlir.dialects.transform import (
+ get_parent_op,
+ apply_patterns_canonicalization,
+ apply_cse,
+ any_op_t,
+)
+from mlir.dialects.transform.structured import structured_match
+from mlir.dialects.transform.loop import loop_unroll
+from mlir.dialects.transform.extras import named_sequence, apply_patterns
+from mlir.extras import types as T
+from mlir.dialects.builtin import module, ModuleOp
+
+
+def construct_and_print_in_module(f):
+ print("\nTEST:", f.__name__)
+ with Context(), Location.unknown():
+ module = Module.create()
+ with InsertionPoint(module.body):
+ module = f(module)
+ if module is not None:
+ print(module)
+ return f
+
+
+# CHECK-LABEL: TEST: test_named_sequence
+@construct_and_print_in_module
+def test_named_sequence(module_):
+ # CHECK-LABEL: func.func @loop_unroll_op() {
+ # CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+ # CHECK: %[[VAL_1:.*]] = arith.constant 42 : index
+ # CHECK: %[[VAL_2:.*]] = arith.constant 5 : index
+ # CHECK: scf.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] {
+ # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : index
+ # CHECK: }
+ # CHECK: return
+ # CHECK: }
+ @func.func()
+ def loop_unroll_op():
+ for i in scf.for_(0, 42, 5):
+ v = arith.addi(i, i)
+ scf.yield_([])
+
+ # CHECK-LABEL: module attributes {transform.with_named_sequence} {
+ # CHECK: transform.named_sequence @__transform_main(%[[VAL_0:.*]]: !transform.any_op) {
+ # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["arith.addi"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+ # CHECK: %[[VAL_2:.*]] = transform.get_parent_op %[[VAL_1]] {op_name = "scf.for"} : (!transform.any_op) -> !pdl.operation
+ # CHECK: transform.loop.unroll %[[VAL_2]] {factor = 4 : i64} : !pdl.operation
+ # CHECK: transform.yield
+ # CHECK: }
+ # CHECK: }
+ @module(attrs={"transform.with_named_sequence": UnitAttr.get()})
+ def mod():
+ @named_sequence("__transform_main", [any_op_t()], [])
+ def basic(target: any_op_t()):
+ m = structured_match(any_op_t(), target, ops=["arith.addi"])
+ loop = get_parent_op(pdl.op_t(), m, op_name="scf.for")
+ loop_unroll(loop, 4)
+
+ # The identifier (name) of the function becomes the Operation
+ assert isinstance(mod.opview, ModuleOp)
+
+ print(module_)
+
+ pm = PassManager.parse("builtin.module(transform-interpreter)")
+ pm.run(module_.operation)
+
+ # CHECK-LABEL: func.func @loop_unroll_op() {
+ # CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+ # CHECK: %[[VAL_1:.*]] = arith.constant 42 : index
+ # CHECK: %[[VAL_2:.*]] = arith.constant 5 : index
+ # CHECK: %[[VAL_6:.*]] = arith.constant 40 : index
+ # CHECK: %[[VAL_7:.*]] = arith.constant 20 : index
+ # CHECK: scf.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_6]] step %[[VAL_7]] {
+ # CHECK: %[[VAL_5:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : index
+ # CHECK: %[[VAL_8:.*]] = arith.constant 1 : index
+ # CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_2]], %[[VAL_8]] : index
+ # CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_3]], %[[VAL_9]] : index
+ # CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_10]] : index
+ # CHECK: %[[VAL_12:.*]] = arith.constant 2 : index
+ # CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_2]], %[[VAL_12]] : index
+ # CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_3]], %[[VAL_13]] : index
+ # CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_14]] : index
+ # CHECK: %[[VAL_16:.*]] = arith.constant 3 : index
+ # CHECK: %[[VAL_17:.*]] = arith.muli %[[VAL_2]], %[[VAL_16]] : index
+ # CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_3]], %[[VAL_17]] : index
+ # CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_18]] : index
+ # CHECK: }
+ # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_6]], %[[VAL_6]] : index
+ # CHECK: return
+ # CHECK: }
+ print(module_)
+
+
+# CHECK-LABEL: TEST: test_apply_patterns
+@construct_and_print_in_module
+def test_apply_patterns(module_):
+ M, N, K = 3, 5, 3
+
+ # CHECK-LABEL: func.func @matmul(
+ # CHECK-SAME: %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
+ # CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32
+ # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : i32
+ # CHECK: %[[VAL_5:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
+ # CHECK: return %[[VAL_5]] : tensor<3x3xf32>
+ # CHECK: }
+ @func.func(
+ T.tensor(M, N, T.f32()), T.tensor(N, K, T.f32()), T.tensor(M, K, T.f32())
+ )
+ def matmul(A, B, C):
+ i = arith.constant(T.i32(), 1)
+ v = arith.addi(i, i)
+ return linalg.matmul(A, B, outs=[C])
+
+ # CHECK-LABEL: module attributes {transform.with_named_sequence} {
+ # CHECK: transform.named_sequence @__transform_main(%[[VAL_0:.*]]: !transform.any_op) {
+ # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["linalg.matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+ # CHECK: %[[VAL_2:.*]] = transform.get_parent_op %[[VAL_1]] {op_name = "func.func"} : (!transform.any_op) -> !pdl.operation
+ # CHECK: transform.apply_patterns to %[[VAL_2]] {
+ # CHECK: transform.apply_patterns.canonicalization
+ # CHECK: } : !pdl.operation
+ # CHECK: %[[VAL_3:.*]] = transform.structured.match ops{["func.func"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+ # CHECK: transform.apply_cse to %[[VAL_3]] : !transform.any_op
+ # CHECK: transform.yield
+ # CHECK: }
+ # CHECK: }
+ @module(attrs={"transform.with_named_sequence": UnitAttr.get()})
+ def mod():
+ @named_sequence("__transform_main", [any_op_t()], [])
+ def basic(variant_op: any_op_t()):
+ matmul = structured_match(any_op_t(), variant_op, ops=["linalg.matmul"])
+ top_func = get_parent_op(pdl.op_t(), matmul, op_name="func.func")
+
+ @apply_patterns(top_func)
+ def pats():
+ apply_patterns_canonicalization()
+
+ top_func = structured_match(any_op_t(), variant_op, ops=["func.func"])
+ apply_cse(top_func)
+
+ print(module_)
+
+ pm = PassManager.parse("builtin.module(transform-interpreter)")
+ pm.run(module_.operation)
+
+ # CHECK-LABEL: func.func @matmul(
+ # CHECK-SAME: %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
+ # CHECK: %[[VAL_3:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
+ # CHECK: return %[[VAL_3]] : tensor<3x3xf32>
+ # CHECK: }
+ print(module_)
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index eedade691c6c..dc4121dc46bb 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -47,7 +47,7 @@ void registerTestAffineReifyValueBoundsPass();
void registerTestBytecodeRoundtripPasses();
void registerTestDecomposeAffineOpPass();
void registerTestAffineLoopUnswitchingPass();
-void registerTestAllReduceLoweringPass();
+void registerTestGpuLoweringPasses();
void registerTestFunc();
void registerTestGpuMemoryPromotionPass();
void registerTestLoopPermutationPass();
@@ -167,7 +167,7 @@ void registerTestPasses() {
registerTestAffineReifyValueBoundsPass();
registerTestDecomposeAffineOpPass();
registerTestAffineLoopUnswitchingPass();
- registerTestAllReduceLoweringPass();
+ registerTestGpuLoweringPasses();
registerTestBytecodeRoundtripPasses();
registerTestFunc();
registerTestGpuMemoryPromotionPass();
diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp
index 877ef1089dce..7cd2690ea815 100644
--- a/mlir/tools/mlir-tblgen/OpDocGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp
@@ -123,6 +123,12 @@ static void emitAssemblyFormat(StringRef opName, StringRef format,
os << "```\n\n";
}
+/// Place `text` between backticks so that the Markdown processor renders it as
+/// inline code.
+static std::string backticks(const std::string &text) {
+ return '`' + text + '`';
+}
+
static void emitOpTraitsDoc(const Operator &op, raw_ostream &os) {
// TODO: We should link to the trait/documentation of it. That also means we
// should add descriptions to traits that can be queried.
@@ -155,14 +161,14 @@ static void emitOpTraitsDoc(const Operator &op, raw_ostream &os) {
os << effect << " on " << rec->getValueAsString("resource");
});
os << "}";
- effects.insert(os.str());
+ effects.insert(backticks(os.str()));
name.append(llvm::formatv(" ({0})", traitName).str());
}
- interfaces.insert(name);
+ interfaces.insert(backticks(name));
continue;
}
- traits.insert(name);
+ traits.insert(backticks(name));
}
if (!traits.empty()) {
llvm::interleaveComma(traits, os << "\nTraits: ");
diff --git a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
index 5b8e236b4618..032f5760361f 100644
--- a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
+++ b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp
@@ -27,6 +27,7 @@
#include "mlir/Dialect/LLVMIR/Transforms/RequestCWrappers.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
@@ -105,8 +106,8 @@ int main(int argc, char **argv) {
mlir::DialectRegistry registry;
registry.insert<mlir::arith::ArithDialect, mlir::LLVM::LLVMDialect,
mlir::gpu::GPUDialect, mlir::spirv::SPIRVDialect,
- mlir::func::FuncDialect, mlir::memref::MemRefDialect,
- mlir::vector::VectorDialect>();
+ mlir::scf::SCFDialect, mlir::func::FuncDialect,
+ mlir::memref::MemRefDialect, mlir::vector::VectorDialect>();
mlir::registerBuiltinDialectTranslation(registry);
mlir::registerLLVMDialectTranslation(registry);
diff --git a/mlir/unittests/Analysis/Presburger/CMakeLists.txt b/mlir/unittests/Analysis/Presburger/CMakeLists.txt
index b6ce273e35a0..e37133354e53 100644
--- a/mlir/unittests/Analysis/Presburger/CMakeLists.txt
+++ b/mlir/unittests/Analysis/Presburger/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_unittest(MLIRPresburgerTests
PresburgerRelationTest.cpp
PresburgerSpaceTest.cpp
PWMAFunctionTest.cpp
+ QuasiPolynomialTest.cpp
SimplexTest.cpp
UtilsTest.cpp
)
diff --git a/mlir/unittests/Analysis/Presburger/MatrixTest.cpp b/mlir/unittests/Analysis/Presburger/MatrixTest.cpp
index e6e452790f82..103619518c15 100644
--- a/mlir/unittests/Analysis/Presburger/MatrixTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/MatrixTest.cpp
@@ -251,6 +251,9 @@ TEST(MatrixTest, computeHermiteNormalForm) {
}
TEST(MatrixTest, inverse) {
+ IntMatrix mat1 = makeIntMatrix(2, 2, {{2, 1}, {7, 0}});
+ EXPECT_EQ(mat1.determinant(), -7);
+
FracMatrix mat = makeFracMatrix(
2, 2, {{Fraction(2), Fraction(1)}, {Fraction(7), Fraction(0)}});
FracMatrix inverse = makeFracMatrix(
diff --git a/mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp b/mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp
index dd06d462f54b..154cedcd2296 100644
--- a/mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp
@@ -110,6 +110,20 @@ TEST(PresburgerSpaceTest, removeVarRangeIdentifier) {
EXPECT_EQ(space.getId(VarKind::Range, 1), Identifier(&identifiers[5]));
}
+TEST(PresburgerSpaceTest, IdentifierIsEqual) {
+ PresburgerSpace space = PresburgerSpace::getRelationSpace(1, 2, 0, 0);
+ space.resetIds();
+
+ int identifiers[2] = {0, 1};
+ space.getId(VarKind::Domain, 0) = Identifier(&identifiers[0]);
+ space.getId(VarKind::Range, 0) = Identifier(&identifiers[0]);
+ space.getId(VarKind::Range, 1) = Identifier(&identifiers[1]);
+
+ EXPECT_EQ(space.getId(VarKind::Domain, 0), space.getId(VarKind::Range, 0));
+ EXPECT_FALSE(
+ space.getId(VarKind::Range, 0).isEqual(space.getId(VarKind::Range, 1)));
+}
+
TEST(PresburgerSpaceTest, convertVarKind) {
PresburgerSpace space = PresburgerSpace::getRelationSpace(2, 2, 0, 0);
space.resetIds();
@@ -179,3 +193,73 @@ TEST(PresburgerSpaceTest, convertVarKind2) {
EXPECT_EQ(space.getId(VarKind::Symbol, 2), Identifier(&identifiers[1]));
EXPECT_EQ(space.getId(VarKind::Symbol, 3), Identifier(&identifiers[3]));
}
+
+TEST(PresburgerSpaceTest, mergeAndAlignSymbols) {
+ PresburgerSpace space = PresburgerSpace::getRelationSpace(3, 3, 2, 0);
+ space.resetIds();
+
+ PresburgerSpace otherSpace = PresburgerSpace::getRelationSpace(3, 2, 3, 0);
+ otherSpace.resetIds();
+
+ // Attach identifiers.
+ int identifiers[7] = {0, 1, 2, 3, 4, 5, 6};
+ int otherIdentifiers[8] = {10, 11, 12, 13, 14, 15, 16, 17};
+
+ space.getId(VarKind::Domain, 0) = Identifier(&identifiers[0]);
+ space.getId(VarKind::Domain, 1) = Identifier(&identifiers[1]);
+ // Note the common identifier.
+ space.getId(VarKind::Domain, 2) = Identifier(&otherIdentifiers[2]);
+ space.getId(VarKind::Range, 0) = Identifier(&identifiers[2]);
+ space.getId(VarKind::Range, 1) = Identifier(&identifiers[3]);
+ space.getId(VarKind::Range, 2) = Identifier(&identifiers[4]);
+ space.getId(VarKind::Symbol, 0) = Identifier(&identifiers[5]);
+ space.getId(VarKind::Symbol, 1) = Identifier(&identifiers[6]);
+
+ otherSpace.getId(VarKind::Domain, 0) = Identifier(&otherIdentifiers[0]);
+ otherSpace.getId(VarKind::Domain, 1) = Identifier(&otherIdentifiers[1]);
+ otherSpace.getId(VarKind::Domain, 2) = Identifier(&otherIdentifiers[2]);
+ otherSpace.getId(VarKind::Range, 0) = Identifier(&otherIdentifiers[3]);
+ otherSpace.getId(VarKind::Range, 1) = Identifier(&otherIdentifiers[4]);
+ // Note the common identifier.
+ otherSpace.getId(VarKind::Symbol, 0) = Identifier(&identifiers[6]);
+ otherSpace.getId(VarKind::Symbol, 1) = Identifier(&otherIdentifiers[5]);
+ otherSpace.getId(VarKind::Symbol, 2) = Identifier(&otherIdentifiers[7]);
+
+ space.mergeAndAlignSymbols(otherSpace);
+
+ // Check if merge & align is successful.
+ // Check symbol var identifiers.
+ EXPECT_EQ(4u, space.getNumSymbolVars());
+ EXPECT_EQ(4u, otherSpace.getNumSymbolVars());
+ EXPECT_EQ(space.getId(VarKind::Symbol, 0), Identifier(&identifiers[5]));
+ EXPECT_EQ(space.getId(VarKind::Symbol, 1), Identifier(&identifiers[6]));
+ EXPECT_EQ(space.getId(VarKind::Symbol, 2), Identifier(&otherIdentifiers[5]));
+ EXPECT_EQ(space.getId(VarKind::Symbol, 3), Identifier(&otherIdentifiers[7]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Symbol, 0), Identifier(&identifiers[5]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Symbol, 1), Identifier(&identifiers[6]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Symbol, 2),
+ Identifier(&otherIdentifiers[5]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Symbol, 3),
+ Identifier(&otherIdentifiers[7]));
+ // Check that domain and range var identifiers are not affected.
+ EXPECT_EQ(3u, space.getNumDomainVars());
+ EXPECT_EQ(3u, space.getNumRangeVars());
+ EXPECT_EQ(space.getId(VarKind::Domain, 0), Identifier(&identifiers[0]));
+ EXPECT_EQ(space.getId(VarKind::Domain, 1), Identifier(&identifiers[1]));
+ EXPECT_EQ(space.getId(VarKind::Domain, 2), Identifier(&otherIdentifiers[2]));
+ EXPECT_EQ(space.getId(VarKind::Range, 0), Identifier(&identifiers[2]));
+ EXPECT_EQ(space.getId(VarKind::Range, 1), Identifier(&identifiers[3]));
+ EXPECT_EQ(space.getId(VarKind::Range, 2), Identifier(&identifiers[4]));
+ EXPECT_EQ(3u, otherSpace.getNumDomainVars());
+ EXPECT_EQ(2u, otherSpace.getNumRangeVars());
+ EXPECT_EQ(otherSpace.getId(VarKind::Domain, 0),
+ Identifier(&otherIdentifiers[0]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Domain, 1),
+ Identifier(&otherIdentifiers[1]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Domain, 2),
+ Identifier(&otherIdentifiers[2]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Range, 0),
+ Identifier(&otherIdentifiers[3]));
+ EXPECT_EQ(otherSpace.getId(VarKind::Range, 1),
+ Identifier(&otherIdentifiers[4]));
+}
diff --git a/mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp b/mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp
new file mode 100644
index 000000000000..a84f0234067a
--- /dev/null
+++ b/mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp
@@ -0,0 +1,140 @@
+//===- QuasiPolynomialTest.cpp - Tests for QuasiPolynomial ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/Presburger/QuasiPolynomial.h"
+#include "./Utils.h"
+#include "mlir/Analysis/Presburger/Fraction.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+using namespace mlir;
+using namespace presburger;
+
+// Test the arithmetic operations on QuasiPolynomials:
+// addition, subtraction, multiplication, and division
+// by a constant.
+// Two QPs of 3 parameters each were generated randomly
+// and their sum, difference, and product computed by hand.
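+// In this representation, a quasipolynomial over p parameters is a sum of
+// terms: term i is coefficients[i] times a product of affine expressions,
+// each stored as p + 1 fractions with the constant term last. Here p = 3,
+// so every affine expression below has four entries.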
+TEST(QuasiPolynomialTest, arithmetic) {
+ QuasiPolynomial qp1(
+ 3, {Fraction(1, 3), Fraction(1, 1), Fraction(1, 2)},
+ {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)},
+ {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}},
+ {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}},
+ {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)},
+ {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)},
+ {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}}});
+ QuasiPolynomial qp2(
+ 3, {Fraction(1, 1), Fraction(2, 1)},
+ {{{Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}},
+ {{Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), Fraction(0, 1)}}});
+
+ QuasiPolynomial sum = qp1 + qp2;
+ EXPECT_EQ_REPR_QUASIPOLYNOMIAL(
+ sum,
+ QuasiPolynomial(
+ 3,
+ {Fraction(1, 3), Fraction(1, 1), Fraction(1, 2), Fraction(1, 1),
+ Fraction(2, 1)},
+ {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)},
+ {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}},
+ {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}},
+ {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)},
+ {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)},
+ {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}},
+ {{Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}},
+ {{Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1),
+ Fraction(0, 1)}}}));
+
+ QuasiPolynomial diff = qp1 - qp2;
+ EXPECT_EQ_REPR_QUASIPOLYNOMIAL(
+ diff,
+ QuasiPolynomial(
+ 3,
+ {Fraction(1, 3), Fraction(1, 1), Fraction(1, 2), Fraction(-1, 1),
+ Fraction(-2, 1)},
+ {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)},
+ {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}},
+ {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}},
+ {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)},
+ {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)},
+ {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}},
+ {{Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}},
+ {{Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1),
+ Fraction(0, 1)}}}));
+
+ QuasiPolynomial prod = qp1 * qp2;
+ EXPECT_EQ_REPR_QUASIPOLYNOMIAL(
+ prod,
+ QuasiPolynomial(
+ 3,
+ {Fraction(1, 3), Fraction(2, 3), Fraction(1, 1), Fraction(2, 1),
+ Fraction(1, 2), Fraction(1, 1)},
+ {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)},
+ {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)},
+ {Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}},
+ {{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)},
+ {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)},
+ {Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), Fraction(0, 1)}},
+ {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)},
+ {Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}},
+ {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)},
+ {Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), Fraction(0, 1)}},
+ {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)},
+ {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)},
+ {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)},
+ {Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}},
+ {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)},
+ {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)},
+ {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)},
+ {Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1),
+ Fraction(0, 1)}}}));
+
+ QuasiPolynomial quot = qp1 / 2;
+ EXPECT_EQ_REPR_QUASIPOLYNOMIAL(
+ quot,
+ QuasiPolynomial(
+ 3, {Fraction(1, 6), Fraction(1, 2), Fraction(1, 4)},
+ {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)},
+ {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}},
+ {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}},
+ {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)},
+ {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)},
+ {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4),
+ Fraction(0, 1)}}}));
+}
+
+// Test the simplify() operation on QPs, which removes terms that
+// are identically zero. A random QP was generated and terms were
+// changed to account for each condition in simplify():
+// the term coefficient being zero, or all the coefficients in some
+// affine term in the product being zero.
+TEST(QuasiPolynomialTest, simplify) {
+ QuasiPolynomial qp(2,
+ {Fraction(2, 3), Fraction(0, 1), Fraction(1, 1),
+ Fraction(1, 2), Fraction(0, 1)},
+ {{{Fraction(1, 1), Fraction(3, 4), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(0, 1), Fraction(0, 1)}},
+ {{Fraction(1, 3), Fraction(8, 5), Fraction(2, 5)}},
+ {{Fraction(2, 7), Fraction(9, 5), Fraction(0, 1)},
+ {Fraction(0, 1), Fraction(0, 1), Fraction(0, 1)}},
+ {{Fraction(1, 1), Fraction(4, 5), Fraction(6, 5)}},
+ {{Fraction(1, 3), Fraction(4, 3), Fraction(7, 8)}}});
+ EXPECT_EQ_REPR_QUASIPOLYNOMIAL(
+ qp.simplify(),
+ QuasiPolynomial(2, {Fraction(2, 3), Fraction(1, 2)},
+ {{{Fraction(1, 1), Fraction(3, 4), Fraction(5, 3)},
+ {Fraction(2, 1), Fraction(0, 1), Fraction(0, 1)}},
+ {{Fraction(1, 1), Fraction(4, 5), Fraction(6, 5)}}}));
+} \ No newline at end of file
diff --git a/mlir/unittests/Analysis/Presburger/Utils.h b/mlir/unittests/Analysis/Presburger/Utils.h
index 544577375dd1..2a9966c7ce2e 100644
--- a/mlir/unittests/Analysis/Presburger/Utils.h
+++ b/mlir/unittests/Analysis/Presburger/Utils.h
@@ -17,6 +17,7 @@
#include "mlir/Analysis/Presburger/Matrix.h"
#include "mlir/Analysis/Presburger/PWMAFunction.h"
#include "mlir/Analysis/Presburger/PresburgerRelation.h"
+#include "mlir/Analysis/Presburger/QuasiPolynomial.h"
#include "mlir/Analysis/Presburger/Simplex.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LLVM.h"
@@ -71,6 +72,28 @@ inline void EXPECT_EQ_FRAC_MATRIX(FracMatrix a, FracMatrix b) {
EXPECT_EQ(a(row, col), b(row, col));
}
+// Check that the representations of two quasipolynomials match: coefficients
+// and affine expressions are compared in order, and each affine expression has
+// getNumInputs() + 1 entries, the last being the constant term.
+// Note that this is not a true equality check; equal quasipolynomials with
+// different representations will not compare equal.
+inline void EXPECT_EQ_REPR_QUASIPOLYNOMIAL(QuasiPolynomial a, QuasiPolynomial b) {
+ EXPECT_EQ(a.getNumInputs(), b.getNumInputs());
+
+ SmallVector<Fraction> aCoeffs = a.getCoefficients(),
+ bCoeffs = b.getCoefficients();
+ EXPECT_EQ(aCoeffs.size(), bCoeffs.size());
+ for (unsigned i = 0, e = aCoeffs.size(); i < e; i++)
+ EXPECT_EQ(aCoeffs[i], bCoeffs[i]);
+
+ std::vector<std::vector<SmallVector<Fraction>>> aAff = a.getAffine(),
+ bAff = b.getAffine();
+ EXPECT_EQ(aAff.size(), bAff.size());
+ for (unsigned i = 0, e = aAff.size(); i < e; i++) {
+ EXPECT_EQ(aAff[i].size(), bAff[i].size());
+ for (unsigned j = 0, f = aAff[i].size(); j < f; j++)
+ for (unsigned k = 0, g = a.getNumInputs(); k <= g; k++)
+ EXPECT_EQ(aAff[i][j][k], bAff[i][j][k]);
+ }
+}
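+// Example use, comparing a computed value against a hand-written expectation
+// (names illustrative; see QuasiPolynomialTest.cpp):
+//   EXPECT_EQ_REPR_QUASIPOLYNOMIAL(qp1 + qp2, expectedSum);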
+
/// lhs and rhs represent non-negative integers or positive infinity. The
/// infinity case corresponds to when the Optional is empty.
inline bool infinityOrUInt64LE(std::optional<MPInt> lhs,
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 66925ccbe030..31e475d86960 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -150,3 +150,11 @@ add_subdirectory(${LIBOMPTARGET_SRC_DIR})
# Add tests.
add_subdirectory(test)
+
+# Add unit tests if GMock/GTest is present
+if (EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest)
+ if (NOT TARGET llvm_gtest)
+ add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
+ endif()
+ add_subdirectory(unittests)
+endif()
diff --git a/openmp/libomptarget/include/Shared/Profile.h b/openmp/libomptarget/include/Shared/Profile.h
index 19ca0cf22751..7e580988a39b 100644
--- a/openmp/libomptarget/include/Shared/Profile.h
+++ b/openmp/libomptarget/include/Shared/Profile.h
@@ -97,4 +97,16 @@ public:
std::string RTM = RegionTypeMsg; \
llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+/// Time spent in the current scope, assigned to the given region type, with
+/// the source location and details from the runtime.
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
+ SourceInfo SI(IDENT); \
+ std::string ProfileLocation = SI.getProfileLocation(); \
+ llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
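+/// Example use (from interface.cpp in this patch):
+///   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+///                                    "NumArgs=" + std::to_string(ArgNum), Loc);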
+
+/// Time spent in the current scope, assigned to the function name, with the
+/// given details.
+#define TIMESCOPE_WITH_DETAILS(Details) \
+ llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
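+/// Example use (from omp_target_alloc in this patch):
+///   TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
+///                          ";size=" + std::to_string(Size));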
+
#endif // OMPTARGET_SHARED_PROFILE_H
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 1cb99c0a5dca..2471590c27b3 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -202,7 +202,7 @@ private:
KernelData.VGPRSpillCount = V.second.getUInt();
} else if (IsKey(V.first, ".private_segment_fixed_size")) {
KernelData.PrivateSegmentSize = V.second.getUInt();
- } else if (IsKey(V.first, ".group_segement_fixed_size")) {
+ } else if (IsKey(V.first, ".group_segment_fixed_size")) {
KernelData.GroupSegmentList = V.second.getUInt();
} else if (IsKey(V.first, ".reqd_workgroup_size")) {
GetSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize);
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index fa079ac9660e..d9fe938790ca 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -89,9 +89,6 @@ public:
/// global metadata (size, addr) from the device.
/// \see getGlobalMetadataFromDevice
class GenericGlobalHandlerTy {
- /// Map to store the ELF object files that have been loaded.
- llvm::DenseMap<int32_t, ELF64LEObjectFile> ELFObjectFiles;
-
/// Actually move memory between host and device. See readGlobalFromDevice and
/// writeGlobalToDevice for the interface description.
Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
@@ -109,10 +106,8 @@ class GenericGlobalHandlerTy {
public:
virtual ~GenericGlobalHandlerTy() {}
- /// Get the cached ELF64LEObjectFile previosuly created for a specific
- /// device image or create it if did not exist.
- const ELF64LEObjectFile *
- getOrCreateELFObjectFile(const GenericDeviceTy &Device, DeviceImageTy &Image);
+ /// Helper function for getting an ELF from a device image.
+ Expected<ELF64LEObjectFile> getELFObjectFile(DeviceImageTy &Image);
/// Returns whether the symbol named \p SymName is present in the given \p
/// Image.
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/JIT.h b/openmp/libomptarget/plugins-nextgen/common/include/JIT.h
index 3ec4424f856a..b22197b89208 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/JIT.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/JIT.h
@@ -57,7 +57,7 @@ struct JITEngine {
/// Return true if \p Image is a bitcode image that can be JITed for the given
/// architecture.
- bool checkBitcodeImage(const __tgt_device_image &Image);
+ Expected<bool> checkBitcodeImage(StringRef Buffer) const;
private:
/// Compile the bitcode image \p Image and generate the binary image that can
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index cf02783d8b33..b85dc146d86d 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -1067,7 +1067,7 @@ struct GenericPluginTy {
/// Top level interface to verify if a given ELF image can be executed on a
/// given target. Returns true if the \p Image is compatible with the plugin.
- Expected<bool> checkELFImage(__tgt_device_image &Image) const;
+ Expected<bool> checkELFImage(StringRef Image) const;
/// Indicate if an image is compatible with the plugin devices. Notice that
/// this function may be called before actually initializing the devices. So
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index 3a272e228c7d..d398f60c55bd 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -25,29 +25,14 @@ using namespace omp;
using namespace target;
using namespace plugin;
-const ELF64LEObjectFile *
-GenericGlobalHandlerTy::getOrCreateELFObjectFile(const GenericDeviceTy &Device,
- DeviceImageTy &Image) {
+Expected<ELF64LEObjectFile>
+GenericGlobalHandlerTy::getELFObjectFile(DeviceImageTy &Image) {
+ assert(utils::elf::isELF(Image.getMemoryBuffer().getBuffer()) &&
+ "Input is not an ELF file");
- auto Search = ELFObjectFiles.find(Image.getId());
- if (Search != ELFObjectFiles.end())
- // The ELF object file was already there.
- return &Search->second;
-
- // The ELF object file we are checking is not created yet.
Expected<ELF64LEObjectFile> ElfOrErr =
ELF64LEObjectFile::create(Image.getMemoryBuffer());
- if (!ElfOrErr) {
- consumeError(ElfOrErr.takeError());
- return nullptr;
- }
-
- auto Result =
- ELFObjectFiles.try_emplace(Image.getId(), std::move(ElfOrErr.get()));
- assert(Result.second && "Map insertion failed");
- assert(Result.first != ELFObjectFiles.end() && "Map insertion failed");
-
- return &Result.first->second;
+ return ElfOrErr;
}
Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
@@ -83,7 +68,8 @@ Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
return Err;
}
- DP("Succesfully %s %u bytes associated with global symbol '%s' %s the device "
+ DP("Succesfully %s %u bytes associated with global symbol '%s' %s the "
+ "device "
"(%p -> %p).\n",
Device2Host ? "read" : "write", HostGlobal.getSize(),
HostGlobal.getName().data(), Device2Host ? "from" : "to",
@@ -98,12 +84,14 @@ bool GenericGlobalHandlerTy::isSymbolInImage(GenericDeviceTy &Device,
// Get the ELF object file for the image. Notice the ELF object may already
// be created in previous calls, so we can reuse it. If this is unsuccessful
// just return false as we couldn't find it.
- const ELF64LEObjectFile *ELFObj = getOrCreateELFObjectFile(Device, Image);
- if (!ELFObj)
+ auto ELFObjOrErr = getELFObjectFile(Image);
+ if (!ELFObjOrErr) {
+ consumeError(ELFObjOrErr.takeError());
return false;
+ }
// Search the ELF symbol using the symbol name.
- auto SymOrErr = utils::elf::getSymbol(*ELFObj, SymName);
+ auto SymOrErr = utils::elf::getSymbol(*ELFObjOrErr, SymName);
if (!SymOrErr) {
consumeError(SymOrErr.takeError());
return false;
@@ -117,10 +105,9 @@ Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
// Get the ELF object file for the image. Notice the ELF object may already
// be created in previous calls, so we can reuse it.
- const ELF64LEObjectFile *ELFObj = getOrCreateELFObjectFile(Device, Image);
+ auto ELFObj = getELFObjectFile(Image);
if (!ELFObj)
- return Plugin::error("Unable to create ELF object for image %p",
- Image.getStart());
+ return ELFObj.takeError();
// Search the ELF symbol using the symbol name.
auto SymOrErr = utils::elf::getSymbol(*ELFObj, ImageGlobal.getName());
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/JIT.cpp b/openmp/libomptarget/plugins-nextgen/common/src/JIT.cpp
index 08080c9d6091..7275be4edfca 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/JIT.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/JIT.cpp
@@ -330,24 +330,18 @@ JITEngine::process(const __tgt_device_image &Image,
return &Image;
}
-bool JITEngine::checkBitcodeImage(const __tgt_device_image &Image) {
+Expected<bool> JITEngine::checkBitcodeImage(StringRef Buffer) const {
TimeTraceScope TimeScope("Check bitcode image");
- if (!isImageBitcode(Image))
- return false;
-
- StringRef Data(reinterpret_cast<const char *>(Image.ImageStart),
- target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
- auto MB = MemoryBuffer::getMemBuffer(Data, /*BufferName=*/"",
- /*RequiresNullTerminator=*/false);
- if (!MB)
- return false;
+ assert(identify_magic(Buffer) == file_magic::bitcode &&
+ "Input is not bitcode");
LLVMContext Context;
- SMDiagnostic Diagnostic;
- std::unique_ptr<Module> M =
- llvm::getLazyIRModule(std::move(MB), Diagnostic, Context,
- /*ShouldLazyLoadMetadata=*/true);
+ auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Buffer, ""), Context,
+ /*ShouldLazyLoadMetadata=*/true);
+ if (!ModuleOrErr)
+ return ModuleOrErr.takeError();
+ Module &M = **ModuleOrErr;
- return M && Triple(M->getTargetTriple()).getArch() == TT.getArch();
+ return Triple(M.getTargetTriple()).getArch() == TT.getArch();
}
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 178c60a77ab5..be9ace571f54 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1632,16 +1632,13 @@ Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
return Plugin::success();
}
-Expected<bool> GenericPluginTy::checkELFImage(__tgt_device_image &Image) const {
- StringRef Buffer(reinterpret_cast<const char *>(Image.ImageStart),
- target::getPtrDiff(Image.ImageEnd, Image.ImageStart));
-
+Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
// First check if this image is a regular ELF file.
- if (!utils::elf::isELF(Buffer))
+ if (!utils::elf::isELF(Image))
return false;
// Check if this image is an ELF with a matching machine value.
- auto MachineOrErr = utils::elf::checkMachine(Buffer, getMagicElfBits());
+ auto MachineOrErr = utils::elf::checkMachine(Image, getMagicElfBits());
if (!MachineOrErr)
return MachineOrErr.takeError();
@@ -1649,7 +1646,7 @@ Expected<bool> GenericPluginTy::checkELFImage(__tgt_device_image &Image) const {
return false;
// Perform plugin-dependent checks for the specific architecture if needed.
- return isELFCompatible(Buffer);
+ return isELFCompatible(Image);
}
const bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
@@ -1678,27 +1675,38 @@ int32_t __tgt_rtl_init_plugin() {
return OFFLOAD_SUCCESS;
}
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *TgtImage) {
- // TODO: We should be able to perform a trivial ELF machine check without
- // initializing the plugin first to save time if the plugin is not needed.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
if (!Plugin::isActive())
return false;
- // Check if this is a valid ELF with a matching machine and processor.
- auto MatchOrErr = Plugin::get().checkELFImage(*TgtImage);
- if (Error Err = MatchOrErr.takeError()) {
+ StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
+ target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
+
+ auto HandleError = [&](Error Err) -> bool {
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
- DP("Failure to check validity of image %p: %s", TgtImage, ErrStr.c_str());
+ DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str());
+ return false;
+ };
+ switch (identify_magic(Buffer)) {
+ case file_magic::elf:
+ case file_magic::elf_relocatable:
+ case file_magic::elf_executable:
+ case file_magic::elf_shared_object:
+ case file_magic::elf_core: {
+ auto MatchOrErr = Plugin::get().checkELFImage(Buffer);
+ if (Error Err = MatchOrErr.takeError())
+ return HandleError(std::move(Err));
+ return *MatchOrErr;
+ }
+ case file_magic::bitcode: {
+ auto MatchOrErr = Plugin::get().getJIT().checkBitcodeImage(Buffer);
+ if (Error Err = MatchOrErr.takeError())
+ return HandleError(std::move(Err));
+ return *MatchOrErr;
+ }
+ default:
return false;
- } else if (*MatchOrErr) {
- return true;
}
-
- // Check if this is a valid LLVM-IR file with matching triple.
- if (Plugin::get().getJIT().checkBitcodeImage(*TgtImage))
- return true;
-
- return false;
}
int32_t __tgt_rtl_supports_empty_images() {
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp b/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
index bdac6c1db5d2..c84c3bad5def 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
@@ -37,8 +37,7 @@ bool utils::elf::isELF(StringRef Buffer) {
}
Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
- if (!isELF(Object))
- return createError("Input is not an ELF.");
+ assert(isELF(Object) && "Input is not an ELF!");
Expected<ELF64LEObjectFile> ElfOrErr =
ELF64LEObjectFile::create(MemoryBufferRef(Object, /*Identifier=*/""),
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 0c7535a0da8b..b0dff917dd0b 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -1063,15 +1063,13 @@ private:
// automatically so we must create it ourselves. The backend will emit
// several globals that contain function pointers we can call. These are
// prefixed with a known name due to Nvidia's lack of section support.
- const ELF64LEObjectFile *ELFObj =
- Handler.getOrCreateELFObjectFile(*this, Image);
- if (!ELFObj)
- return Plugin::error("Unable to create ELF object for image %p",
- Image.getStart());
+ auto ELFObjOrErr = Handler.getELFObjectFile(Image);
+ if (!ELFObjOrErr)
+ return ELFObjOrErr.takeError();
// Search for all symbols that contain a constructor or destructor.
SmallVector<std::pair<StringRef, uint16_t>> Funcs;
- for (ELFSymbolRef Sym : ELFObj->symbols()) {
+ for (ELFSymbolRef Sym : ELFObjOrErr->symbols()) {
auto NameOrErr = Sym.getName();
if (!NameOrErr)
return NameOrErr.takeError();
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 1769404faf88..a7b6eac8bcd6 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,6 +83,8 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
+ ";size=" + std::to_string(Size));
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -99,6 +101,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
}
EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
+ TIMESCOPE();
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -161,7 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -400,7 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -429,7 +436,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions,
int DstDevice, int SrcDevice) {
- TIMESCOPE();
DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
@@ -488,7 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(ElementSize) +
+ ";num_dims=" + std::to_string(NumDims));
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d9e87640161f..61d9db17f510 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
EXTERN void __tgt_register_requires(int64_t Flags) {
- TIMESCOPE();
PM->addRequirements(Flags);
}
////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
if (PM->delayRegisterLib(Desc))
return;
@@ -54,7 +52,6 @@ EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); }
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
PM->unregisterLib(Desc);
}
@@ -68,7 +65,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+ "NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -240,9 +238,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
KernelArgsTy *KernelArgs) {
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
-
- TIMESCOPE_WITH_IDENT(Loc);
-
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
DeviceId, DPxPTR(HostPtr));
@@ -267,6 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Runtime: target exe",
+ "NumTeams=" + std::to_string(NumTeams) +
+ ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
+ Loc);
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
@@ -297,13 +297,14 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
+ { // Scoped so the synchronization time shows up as its own trace entry.
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: synchronize", "", Loc);
+ if (Rc == OFFLOAD_SUCCESS)
+ Rc = AsyncInfo.synchronize();
- if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
-
- handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
- assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
-
+ handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+ assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
+ }
return OMP_TGT_SUCCESS;
}
@@ -402,7 +403,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
- TIMESCOPE();
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -414,7 +414,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
- TIMESCOPE();
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e724b2f6db8b..a7d55d7ebd53 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -392,7 +392,6 @@ static int32_t getParentIndex(int64_t Type) {
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -419,7 +418,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
@@ -444,7 +442,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -471,7 +468,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
auto DeviceOrErr = PM->getDevice(DeviceNum);
@@ -531,14 +527,14 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
- TIMESCOPE_WITH_IDENT(Loc);
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -913,7 +909,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1403,7 +1400,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
SmallVector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
@@ -1537,7 +1533,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
+
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
@@ -1639,7 +1635,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Kernel Target",
+ "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
+ ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
+ ";TripCount=" + std::to_string(KernelArgs.Tripcount),
+ Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index adbdd7cc35cc..19c5e5c45722 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -78,6 +78,10 @@ config.test_flags = " -I " + config.test_source_root + \
" -L " + config.library_dir + \
" -L " + config.llvm_lib_directory
+# compiler-specific flags
+config.test_flags_clang = ""
+config.test_flags_flang = ""
+
if config.omp_host_rtl_directory:
config.test_flags = config.test_flags + " -L " + \
config.omp_host_rtl_directory
@@ -136,7 +140,7 @@ else: # Unices
if config.cuda_libdir:
config.test_flags += " -Wl,-rpath," + config.cuda_libdir
if config.libomptarget_current_target.startswith('nvptx'):
- config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL'
+ config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL'
if config.libomptarget_current_target.endswith('-LTO'):
config.test_flags += " -foffload-lto"
if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env(
@@ -273,13 +277,13 @@ for libomptarget_target in config.libomptarget_all_targets:
libomptarget_target, \
"%not --crash %t"))
config.substitutions.append(("%clangxx-" + libomptarget_target, \
- "%clangxx %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
+ "%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%clang-" + libomptarget_target, \
- "%clang %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
+ "%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%flang-" + libomptarget_target, \
- "%flang %openmp_flags %flags -fopenmp-targets=" +\
+ "%flang %openmp_flags %flags %flags_flang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%fcheck-" + libomptarget_target, \
config.libomptarget_filecheck + " %s"))
@@ -356,5 +360,7 @@ if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path:
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
else:
config.substitutions.append(("%cuda_flags", ""))
+config.substitutions.append(("%flags_clang", config.test_flags_clang))
+config.substitutions.append(("%flags_flang", config.test_flags_flang))
config.substitutions.append(("%flags", config.test_flags))
config.substitutions.append(("%not", config.libomptarget_not))
diff --git a/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90 b/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90
index 993b91d4eb62..476b77e4a549 100644
--- a/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90
+++ b/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90
@@ -1,6 +1,6 @@
! Basic offloading test of arrays with provided lower
! and upper bounds as specified by OpenMP's sectioning
-! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90 b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90
index 669d3674926f..229798b57477 100644
--- a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90
+++ b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90
@@ -1,6 +1,6 @@
! Basic offloading test of a regular array explicitly
! passed within a target region
-! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90
index c87d6ee24aed..ea3048185d52 100644
--- a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90
+++ b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90
@@ -1,6 +1,6 @@
! Basic offloading test of a regular array explicitly
! passed within a target region
-! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90 b/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90
index 6423ac765670..d856d42bb8cd 100644
--- a/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90
+++ b/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90
@@ -1,5 +1,5 @@
! Basic offloading test with a target region
-! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90 b/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90
index 9064f60896f1..669630555c31 100644
--- a/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90
+++ b/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90
@@ -2,8 +2,7 @@
! that checks constant indexing on device
! correctly works (regression test for prior
! bug).
-! REQUIRES: flang, amdgcn-amd-amdhsa
-! UNSUPPORTED: nvptx64-nvidia-cuda
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90 b/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90
index d2e59d93a020..c09146198768 100644
--- a/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90
+++ b/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90
@@ -1,7 +1,7 @@
! Offloading test with a target region mapping a declare target
! Fortran array writing some values to it and checking the host
! correctly receives the updates made on the device.
-! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90 b/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90
index 884acb275a0e..56c96727d475 100644
--- a/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90
+++ b/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90
@@ -2,7 +2,8 @@
! declare target Fortran array and writing some values to
! it before checking the host correctly receives the
! correct updates made on the device.
-! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90 b/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90
index 5ef2547545e4..ada6ef2ad730 100644
--- a/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90
+++ b/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90
@@ -1,6 +1,6 @@
! Basic offloading test of a regular array explicitly
! passed within a target region
-! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
diff --git a/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90 b/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90
index e782ef8a670a..f20423edb957 100644
--- a/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90
+++ b/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90
@@ -1,5 +1,5 @@
! Basic offloading test with a target region
-! REQUIRES: flang, amdgcn-amd-amdhsa
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
diff --git a/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 b/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90
index 8a9c47545fbd..24e3e2b7ab11 100644
--- a/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90
+++ b/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90
@@ -1,4 +1,4 @@
-! REQUIRES: flang, amdgcn-amd-amdhsa
+! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
diff --git a/openmp/libomptarget/test/offloading/fortran/target_update.f90 b/openmp/libomptarget/test/offloading/fortran/target_update.f90
new file mode 100644
index 000000000000..fb35c5a36ab0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target_update.f90
@@ -0,0 +1,50 @@
+! Offloading test for the `target update` directive.
+
+! REQUIRES: flang, amdgcn-amd-amdhsa
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program target_update
+ implicit none
+ integer :: x(1)
+ integer :: host_id
+ integer :: device_id(1)
+
+ INTERFACE
+ FUNCTION omp_get_device_num() BIND(C)
+ USE, INTRINSIC :: iso_c_binding, ONLY: C_INT
+ integer :: omp_get_device_num
+ END FUNCTION omp_get_device_num
+ END INTERFACE
+
+ x(1) = 5
+ host_id = omp_get_device_num()
+
+!$omp target enter data map(to:x, device_id)
+!$omp target
+ x(1) = 42
+!$omp end target
+
+ ! Test that, without a `target update` directive, the update made to x on
+ ! the device is not yet visible on the host.
+ ! CHECK: After first target regions and before target update: x = 5
+ print *, "After first target regions and before target update: x =", x(1)
+
+!$omp target
+ x(1) = 84
+ device_id(1) = omp_get_device_num()
+!$omp end target
+!$omp target update from(x, device_id)
+
+ ! Test that after the `target update`, the host can see the new x value.
+ ! CHECK: After second target regions and target update: x = 84
+ print *, "After second target regions and target update: x =", x(1)
+
+ ! Make sure that offloading to the device actually happened. This way we
+ ! verify that we didn't take the fallback host execution path.
+ ! CHECK: Offloading succeeded!
+ if (host_id /= device_id(1)) then
+ print *, "Offloading succeeded!"
+ else
+ print *, "Offloading failed!"
+ end if
+end program target_update
diff --git a/openmp/libomptarget/test/offloading/malloc.c b/openmp/libomptarget/test/offloading/malloc.c
index ad49ace20062..7b98e1f1110e 100644
--- a/openmp/libomptarget/test/offloading/malloc.c
+++ b/openmp/libomptarget/test/offloading/malloc.c
@@ -6,9 +6,9 @@
int main() {
long unsigned *DP = 0;
- int N = 128;
- int Threads = 128;
- int Teams = 440;
+ int N = 32;
+ int Threads = 64;
+ int Teams = 10;
// Allocate ~55MB on the device.
#pragma omp target map(from : DP)
@@ -31,7 +31,7 @@ int main() {
}
}
- // CHECK: Sum: 203458478080
+ // CHECK: Sum: 6860800
printf("Sum: %li\n", s);
return 0;
}
diff --git a/openmp/libomptarget/test/offloading/malloc_parallel.c b/openmp/libomptarget/test/offloading/malloc_parallel.c
index 4908e00694d9..076a7ba397a3 100644
--- a/openmp/libomptarget/test/offloading/malloc_parallel.c
+++ b/openmp/libomptarget/test/offloading/malloc_parallel.c
@@ -7,9 +7,9 @@
int main() {
long unsigned **DP = 0;
- int N = 128;
- int Threads = 128;
- int Teams = 440;
+ int N = 32;
+ int Threads = 64;
+ int Teams = 10;
#pragma omp target map(from : DP)
DP = (long unsigned **)malloc(sizeof(long unsigned *) * Threads * Teams);
@@ -36,7 +36,7 @@ int main() {
}
}
- // CHECK: Sum: 203458478080
+ // CHECK: Sum: 6860800
printf("Sum: %li\n", s);
return 0;
}
diff --git a/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp b/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
index befed120ca13..f0fde50889da 100644
--- a/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
+++ b/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
@@ -29,6 +29,7 @@ int main() {
dat.datum[7] = 7;
dat.more_datum[17] = 17;
+ dat.datum[dat.arr[0][0]] = 0;
/// The struct is mapped with type 0x0 when the pointer fields are mapped.
/// The struct is also map explicitely by the user. The second mapping by
diff --git a/openmp/libomptarget/unittests/CMakeLists.txt b/openmp/libomptarget/unittests/CMakeLists.txt
new file mode 100644
index 000000000000..73c87b708d25
--- /dev/null
+++ b/openmp/libomptarget/unittests/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_custom_target(LibomptUnitTests)
+set_target_properties(LibomptUnitTests PROPERTIES FOLDER "Tests/UnitTests")
+
+function(add_libompt_unittest test_dirname)
+ add_unittest(LibomptUnitTests ${test_dirname} ${ARGN})
+endfunction()
+
+add_subdirectory(Plugins)
diff --git a/openmp/libomptarget/unittests/Plugins/CMakeLists.txt b/openmp/libomptarget/unittests/Plugins/CMakeLists.txt
new file mode 100644
index 000000000000..67d4d9b463b0
--- /dev/null
+++ b/openmp/libomptarget/unittests/Plugins/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(PLUGINS_TEST_COMMON omptarget)
+set(PLUGINS_TEST_SOURCES NextgenPluginsTest.cpp)
+set(PLUGINS_TEST_INCLUDE ${LIBOMPTARGET_INCLUDE_DIR})
+
+foreach(PLUGIN IN LISTS LIBOMPTARGET_TESTED_PLUGINS)
+ libomptarget_say("Building plugin unit tests for ${PLUGIN}")
+ add_libompt_unittest("${PLUGIN}.unittests" ${PLUGINS_TEST_SOURCES})
+ add_dependencies("${PLUGIN}.unittests" ${PLUGINS_TEST_COMMON} ${PLUGIN})
+ target_link_libraries("${PLUGIN}.unittests" PRIVATE ${PLUGINS_TEST_COMMON} ${PLUGIN})
+ target_include_directories("${PLUGIN}.unittests" PRIVATE ${PLUGINS_TEST_INCLUDE})
+endforeach()
diff --git a/openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp b/openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp
new file mode 100644
index 000000000000..635bd1637c90
--- /dev/null
+++ b/openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp
@@ -0,0 +1,168 @@
+//===------- unittests/Plugins/NextgenPluginsTest.cpp - Plugin tests ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Shared/PluginAPI.h"
+#include "omptarget.h"
+#include "gtest/gtest.h"
+
+#include <unordered_set>
+
+const int DEVICE_ID = 0;
+std::unordered_set<int> setup_map;
+
+int init_test_device(int ID) {
+ if (setup_map.find(ID) != setup_map.end()) {
+ return OFFLOAD_SUCCESS;
+ }
+ if (__tgt_rtl_init_plugin() == OFFLOAD_FAIL ||
+ __tgt_rtl_init_device(ID) == OFFLOAD_FAIL) {
+ return OFFLOAD_FAIL;
+ }
+ setup_map.insert(ID);
+ return OFFLOAD_SUCCESS;
+}
+
+// Test plugin initialization
+TEST(NextgenPluginsTest, PluginInit) {
+ EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
+}
+
+// Test GPU allocation and R/W
+TEST(NextgenPluginsTest, PluginAlloc) {
+ int32_t test_value = 23;
+ int32_t host_value = -1;
+ int64_t var_size = sizeof(int32_t);
+
+ // Init plugin and device
+ EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
+
+ // Allocate memory
+ void *device_ptr =
+ __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr, TARGET_ALLOC_DEFAULT);
+
+ // Check that the result is not null
+ EXPECT_NE(device_ptr, nullptr);
+
+ // Submit data to device
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_submit(DEVICE_ID, device_ptr,
+ &test_value, var_size));
+
+ // Read data from device
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_retrieve(DEVICE_ID, &host_value,
+ device_ptr, var_size));
+
+ // Compare values
+ EXPECT_EQ(host_value, test_value);
+
+ // Cleanup data
+ EXPECT_EQ(OFFLOAD_SUCCESS,
+ __tgt_rtl_data_delete(DEVICE_ID, device_ptr, TARGET_ALLOC_DEFAULT));
+}
+
+// Test async GPU allocation and R/W
+TEST(NextgenPluginsTest, PluginAsyncAlloc) {
+ int32_t test_value = 47;
+ int32_t host_value = -1;
+ int64_t var_size = sizeof(int32_t);
+ __tgt_async_info *info;
+
+ // Init plugin and device
+ EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
+
+ // Check if device supports async
+ // Platforms like x86_64 don't support it
+ if (__tgt_rtl_init_async_info(DEVICE_ID, &info) == OFFLOAD_SUCCESS) {
+ // Allocate memory
+ void *device_ptr = __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr,
+ TARGET_ALLOC_DEFAULT);
+
+ // Check that the result is not null
+ EXPECT_NE(device_ptr, nullptr);
+
+ // Submit data to device asynchronously
+ EXPECT_EQ(OFFLOAD_SUCCESS,
+ __tgt_rtl_data_submit_async(DEVICE_ID, device_ptr, &test_value,
+ var_size, info));
+
+ // Wait for async request to process
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_synchronize(DEVICE_ID, info));
+
+ // Read data from device
+ EXPECT_EQ(OFFLOAD_SUCCESS,
+ __tgt_rtl_data_retrieve_async(DEVICE_ID, &host_value, device_ptr,
+ var_size, info));
+
+ // Wait for async request to process
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_synchronize(DEVICE_ID, info));
+
+ // Compare values
+ EXPECT_EQ(host_value, test_value);
+
+ // Cleanup data
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_ID, device_ptr,
+ TARGET_ALLOC_DEFAULT));
+ }
+}
+
+// Test GPU data exchange
+TEST(NextgenPluginsTest, PluginDataSwap) {
+ int32_t test_value = 23;
+ int32_t host_value = -1;
+ int64_t var_size = sizeof(int32_t);
+
+ // Look for compatible device
+ int DEVICE_TWO = -1;
+ for (int i = 1; i < __tgt_rtl_number_of_devices(); i++) {
+ if (__tgt_rtl_is_data_exchangable(DEVICE_ID, i)) {
+ DEVICE_TWO = i;
+ break;
+ }
+ }
+
+ // Only run test if we have multiple GPUs to test
+ // GPUs must be compatible for test to work
+ if (DEVICE_TWO >= 1) {
+ // Init both GPUs
+ EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID));
+ EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_TWO));
+
+ // Allocate memory on both GPUs
+ // DEVICE_ID will be the source
+ // DEVICE_TWO will be the destination
+ void *source_ptr = __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr,
+ TARGET_ALLOC_DEFAULT);
+ void *dest_ptr = __tgt_rtl_data_alloc(DEVICE_TWO, var_size, nullptr,
+ TARGET_ALLOC_DEFAULT);
+
+ // Check for success in allocation
+ EXPECT_NE(source_ptr, nullptr);
+ EXPECT_NE(dest_ptr, nullptr);
+
+ // Write data to source
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_submit(DEVICE_ID, source_ptr,
+ &test_value, var_size));
+
+ // Transfer data between devices
+ EXPECT_EQ(OFFLOAD_SUCCESS,
+ __tgt_rtl_data_exchange(DEVICE_ID, source_ptr, DEVICE_TWO,
+ dest_ptr, var_size));
+
+ // Read from destination device (DEVICE_TWO) memory
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_retrieve(DEVICE_TWO, &host_value,
+ dest_ptr, var_size));
+
+ // Ensure match
+ EXPECT_EQ(host_value, test_value);
+
+ // Cleanup
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_ID, source_ptr,
+ TARGET_ALLOC_DEFAULT));
+ EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_TWO, dest_ptr,
+ TARGET_ALLOC_DEFAULT));
+ }
+}
diff --git a/polly/cmake/polly_macros.cmake b/polly/cmake/polly_macros.cmake
index 518a09b45a42..df541eeccc4c 100644
--- a/polly/cmake/polly_macros.cmake
+++ b/polly/cmake/polly_macros.cmake
@@ -43,9 +43,12 @@ macro(add_polly_library name)
endif( LLVM_LINK_COMPONENTS )
if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "LLVMPolly")
install(TARGETS ${name}
+ COMPONENT ${name}
EXPORT LLVMExports
LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+ add_llvm_install_targets(install-${name}
+ COMPONENT ${name})
endif()
set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
endmacro(add_polly_library)
diff --git a/polly/lib/External/CMakeLists.txt b/polly/lib/External/CMakeLists.txt
index 458925f9b6e4..1869410c8baa 100644
--- a/polly/lib/External/CMakeLists.txt
+++ b/polly/lib/External/CMakeLists.txt
@@ -306,6 +306,7 @@ if (POLLY_BUNDLED_ISL)
target_link_libraries(polly-isl-test PRIVATE
PollyISL
+ LLVMSupport
)
# ISL requires at least C99 to compile. gcc < 5.0 use -std=gnu89 as default.
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 35a0a4def040..5a0ea3b40675 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -96,6 +96,13 @@ static cl::opt<std::string>
cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
cl::init("yes"), cl::cat(PollyCategory));
+static cl::opt<int>
+ ScheduleComputeOut("polly-schedule-computeout",
+                       cl::desc("Bound the scheduler by a maximal amount "
+                                "of computational steps."),
+ cl::Hidden, cl::init(300000), cl::ZeroOrMore,
+ cl::cat(PollyCategory));
+
static cl::opt<bool>
GreedyFusion("polly-loopfusion-greedy",
cl::desc("Aggressively try to fuse everything"), cl::Hidden,
@@ -860,7 +867,16 @@ static void runIslScheduleOptimizer(
SC = SC.set_proximity(Proximity);
SC = SC.set_validity(Validity);
SC = SC.set_coincidence(Validity);
- Schedule = SC.compute_schedule();
+
+ {
+ IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut);
+ Schedule = SC.compute_schedule();
+
+ if (MaxOpGuard.hasQuotaExceeded())
+ LLVM_DEBUG(
+ dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
+ }
+
isl_options_set_on_error(Ctx, OnErrorStatus);
ScopsRescheduled++;
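
A minimal sketch (not part of the patch) of the compute-out guard pattern introduced in the hunk above: the ISL computation runs inside an IslMaxOperationsGuard scope, so a quota overrun ends the computation instead of letting it run unbounded. The function name and default quota are illustrative; the guard and the schedule-constraints calls are the ones already used in the change:

// Sketch only: bound an ISL schedule computation by an operations quota.
// Assumes Polly's IslMaxOperationsGuard and the isl C++ bindings used above.
static isl::schedule computeBoundedSchedule(isl_ctx *Ctx,
                                            isl::schedule_constraints SC,
                                            unsigned long MaxOps = 300000) {
  isl::schedule Schedule;
  {
    // While the guard is in scope, ISL aborts work exceeding MaxOps steps.
    IslMaxOperationsGuard MaxOpGuard(Ctx, MaxOps);
    Schedule = SC.compute_schedule();
    if (MaxOpGuard.hasQuotaExceeded())
      LLVM_DEBUG(
          dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
  }
  // On a quota overrun the returned schedule is typically null; callers are
  // expected to keep the original schedule in that case.
  return Schedule;
}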
diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll
new file mode 100644
index 000000000000..db46ef502193
--- /dev/null
+++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll
@@ -0,0 +1,94 @@
+; RUN: opt -S -polly-optree -polly-delicm -polly-opt-isl -polly-schedule-computeout=100000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
+; Bail out if the schedule computation exceeds the maximum scheduling quota.
+; The compute-out quota defaults to 300000; it is set to 100000 here for test purposes.
+
+@a = dso_local local_unnamed_addr global ptr null, align 8
+@b = dso_local local_unnamed_addr global ptr null, align 8
+@c = dso_local local_unnamed_addr global ptr null, align 8
+
+define dso_local void @foo(i32 noundef %I, i32 noundef %J, i32 noundef %K1, i32 noundef %K2, i32 noundef %L1, i32 noundef %L2) local_unnamed_addr {
+entry:
+ %j = alloca i32, align 4
+ store volatile i32 0, ptr %j, align 4
+ %j.0.j.0.j.0.54 = load volatile i32, ptr %j, align 4
+ %cmp55 = icmp slt i32 %j.0.j.0.j.0.54, %J
+ br i1 %cmp55, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load ptr, ptr @a, align 8
+ %1 = load ptr, ptr @b, align 8
+ %2 = load ptr, ptr %1, align 8
+ %cmp352 = icmp slt i32 %L1, %L2
+ %cmp750 = icmp slt i32 %K1, %K2
+ %3 = sext i32 %K1 to i64
+ %4 = sext i32 %L1 to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup4, %entry
+ ret void
+
+for.body: ; preds = %for.cond.cleanup4, %for.body.lr.ph
+ br i1 %cmp352, label %for.cond6.preheader.preheader, label %for.cond.cleanup4
+
+for.cond6.preheader.preheader: ; preds = %for.body
+ %wide.trip.count66 = sext i32 %L2 to i64
+ br label %for.cond6.preheader
+
+for.cond6.preheader: ; preds = %for.cond.cleanup8, %for.cond6.preheader.preheader
+ %indvars.iv61 = phi i64 [ %4, %for.cond6.preheader.preheader ], [ %indvars.iv.next62, %for.cond.cleanup8 ]
+ br i1 %cmp750, label %for.cond10.preheader.lr.ph, label %for.cond.cleanup8
+
+for.cond10.preheader.lr.ph: ; preds = %for.cond6.preheader
+ %5 = mul nsw i64 %indvars.iv61, 516
+ %6 = mul nsw i64 %indvars.iv61, 516
+ %wide.trip.count = sext i32 %K2 to i64
+ br label %for.cond10.preheader
+
+for.cond.cleanup4: ; preds = %for.cond.cleanup8, %for.body
+ %j.0.j.0.j.0.45 = load volatile i32, ptr %j, align 4
+ %inc34 = add nsw i32 %j.0.j.0.j.0.45, 1
+ store volatile i32 %inc34, ptr %j, align 4
+ %j.0.j.0.j.0. = load volatile i32, ptr %j, align 4
+ %cmp = icmp slt i32 %j.0.j.0.j.0., %J
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond10.preheader: ; preds = %for.cond.cleanup12, %for.cond10.preheader.lr.ph
+ %indvars.iv = phi i64 [ %3, %for.cond10.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup12 ]
+ %7 = getelementptr float, ptr %0, i64 %indvars.iv
+ %arrayidx18 = getelementptr float, ptr %7, i64 %5
+ %8 = load float, ptr %arrayidx18, align 4
+ br label %for.cond14.preheader
+
+for.cond.cleanup8: ; preds = %for.cond.cleanup12, %for.cond6.preheader
+ %indvars.iv.next62 = add nsw i64 %indvars.iv61, 1
+ %exitcond67.not = icmp eq i64 %indvars.iv.next62, %wide.trip.count66
+ br i1 %exitcond67.not, label %for.cond.cleanup4, label %for.cond6.preheader
+
+for.cond14.preheader: ; preds = %for.cond.cleanup16, %for.cond10.preheader
+ %m.049 = phi i32 [ -2, %for.cond10.preheader ], [ %inc21, %for.cond.cleanup16 ]
+ %sum.048 = phi float [ 0.000000e+00, %for.cond10.preheader ], [ %add19, %for.cond.cleanup16 ]
+ br label %for.body17
+
+for.cond.cleanup12: ; preds = %for.cond.cleanup16
+ %9 = getelementptr float, ptr %2, i64 %indvars.iv
+ %arrayidx26 = getelementptr float, ptr %9, i64 %6
+ store float %add19, ptr %arrayidx26, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, 1
+ %exitcond60.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond60.not, label %for.cond.cleanup8, label %for.cond10.preheader
+
+for.cond.cleanup16: ; preds = %for.body17
+ %inc21 = add nsw i32 %m.049, 1
+ %exitcond56.not = icmp eq i32 %inc21, 3
+ br i1 %exitcond56.not, label %for.cond.cleanup12, label %for.cond14.preheader
+
+for.body17: ; preds = %for.body17, %for.cond14.preheader
+ %n.047 = phi i32 [ -2, %for.cond14.preheader ], [ %inc, %for.body17 ]
+ %sum.146 = phi float [ %sum.048, %for.cond14.preheader ], [ %add19, %for.body17 ]
+ %add19 = fadd float %sum.146, %8
+ %inc = add nsw i32 %n.047, 1
+ %exitcond.not = icmp eq i32 %inc, 3
+ br i1 %exitcond.not, label %for.cond.cleanup16, label %for.body17
+}
+
+; CHECK: Schedule optimizer calculation exceeds ISL quota
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
index d9bd651f8a4d..3a56e8cace48 100644
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -449,6 +449,8 @@ std::string GetSystemName() {
#define HOST_NAME_MAX 154
#elif defined(BENCHMARK_OS_RTEMS)
#define HOST_NAME_MAX 256
+#elif defined(BENCHMARK_OS_ZOS)
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#else
#pragma message("HOST_NAME_MAX not defined. using 64")
#define HOST_NAME_MAX 64
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 0b68f7d5fcd6..a1f1b8b4a0db 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -2386,6 +2386,7 @@ cc_binary(
name = "clang-format",
srcs = [
"tools/clang-format/ClangFormat.cpp",
+ "lib/Format/MatchFilePath.h",
],
stamp = 0,
deps = [
diff --git a/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h b/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h
index 88ace233b541..ac0d9eb24931 100644
--- a/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h
+++ b/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h
@@ -71,6 +71,12 @@
/* Define if we have sys/resource.h (rlimits) */
/* CLANG_HAVE_RLIMITS defined conditionally below */
+/* Define if we have dlfcn.h */
+#define CLANG_HAVE_DLFCN_H 1
+
+/* Define if dladdr() is available on this platform. */
+#define CLANG_HAVE_DLADDR 1
+
/* Linker version detected at compile time. */
/* #undef HOST_LINK_VERSION */
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index d4f2c078db79..b5238f7686e5 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -659,17 +659,6 @@ libc_support_library(
)
libc_support_library(
- name = "__support_fputil_float_properties",
- hdrs = ["src/__support/FPUtil/FloatProperties.h"],
- deps = [
- ":__support_macros_attributes",
- ":__support_macros_properties_float",
- ":__support_math_extras",
- ":__support_uint128",
- ],
-)
-
-libc_support_library(
name = "__support_fputil_fp_bits",
hdrs = ["src/__support/FPUtil/FPBits.h"],
textual_hdrs = ["src/__support/FPUtil/x86_64/LongDoubleBits.h"],
@@ -677,8 +666,9 @@ libc_support_library(
":__support_common",
":__support_cpp_bit",
":__support_cpp_type_traits",
- ":__support_fputil_float_properties",
":__support_macros_attributes",
+ ":__support_macros_properties_float",
+ ":__support_math_extras",
":__support_uint128",
],
)
@@ -690,7 +680,6 @@ libc_support_library(
":__support_common",
":__support_cpp_string",
":__support_cpp_type_traits",
- ":__support_fputil_float_properties",
":__support_fputil_fp_bits",
":__support_integer_to_string",
":__support_uint128",
@@ -801,7 +790,6 @@ libc_support_library(
":__support_cpp_bit",
":__support_cpp_type_traits",
":__support_fputil_fenv_impl",
- ":__support_fputil_float_properties",
":__support_fputil_fp_bits",
":__support_fputil_rounding_mode",
":__support_macros_attributes",
@@ -880,7 +868,6 @@ libc_support_library(
hdrs = ["src/__support/FPUtil/dyadic_float.h"],
deps = [
":__support_common",
- ":__support_fputil_float_properties",
":__support_fputil_fp_bits",
":__support_fputil_multiply_add",
":__support_macros_optimization",
@@ -2782,7 +2769,6 @@ libc_support_library(
":__support_cpp_string_view",
":__support_float_to_string",
":__support_fputil_fenv_impl",
- ":__support_fputil_float_properties",
":__support_fputil_fp_bits",
":__support_fputil_rounding_mode",
":__support_integer_to_string",
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
index 0c63bdb96333..cf27001be9df 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
@@ -139,7 +139,6 @@ def libc_math_function(
":__support_fputil_division_and_remainder_operations",
":__support_fputil_fenv_impl",
":__support_fputil_fp_bits",
- ":__support_fputil_float_properties",
":__support_fputil_hypot",
":__support_fputil_manipulation_functions",
":__support_fputil_nearest_integer_operations",
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
index 11a7b3b7404f..22f4d03ee900 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
@@ -49,7 +49,7 @@ libc_test(
"str_to_long_double_test.cpp",
],
deps = [
- "//libc:__support_fputil_float_properties",
+ "//libc:__support_fputil_fp_bits",
"//libc:__support_str_to_float",
"//libc:__support_uint128",
],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
index 1dffafdd5394..aba259ba6401 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
@@ -27,7 +27,6 @@ def math_test(name, hdrs = [], deps = [], **kwargs):
deps = [
"//libc:__support_fputil_basic_operations",
"//libc:__support_fputil_fenv_impl",
- "//libc:__support_fputil_float_properties",
"//libc:__support_fputil_fp_bits",
"//libc:__support_fputil_manipulation_functions",
"//libc:__support_fputil_nearest_integer_operations",
diff --git a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
index 6c8c20e3b9e8..564af38c20e0 100644
--- a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel
@@ -44,7 +44,6 @@ libc_support_library(
"//libc:__support_cpp_string",
"//libc:__support_cpp_string_view",
"//libc:__support_cpp_type_traits",
- "//libc:__support_fputil_float_properties",
"//libc:__support_fputil_fp_bits",
"//libc:__support_fputil_fpbits_str",
"//libc/test/UnitTest:fp_test_helpers",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index fd74bac5a8c5..f035a17833d8 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -5225,6 +5225,7 @@ cc_binary(
copts = llvm_copts,
stamp = 0,
deps = [
+ ":BinaryFormat",
":Object",
":Option",
":ReadTAPIOptsTableGen",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index c80d714f6a99..0850f7095458 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -5457,6 +5457,7 @@ cc_library(
deps = [
":GPUDialect",
":GPUPassIncGen",
+ ":IR",
":Pass",
":SPIRVDialect",
":Support",
@@ -9360,6 +9361,7 @@ cc_binary(
":MlirJitRunner",
":Pass",
":ReconcileUnrealizedCasts",
+ ":SCFDialect",
":SPIRVDialect",
":SPIRVTransforms",
":ToLLVMIRTranslation",
diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
index 049098b158f2..6f6f2b3798e8 100644
--- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
@@ -1058,6 +1058,37 @@ filegroup(
)
##---------------------------------------------------------------------------##
+# SPIRV dialect.
+##---------------------------------------------------------------------------##
+
+gentbl_filegroup(
+ name = "SPIRVOpsPyGen",
+ tbl_outs = [
+ (
+ [
+ "-gen-python-op-bindings",
+ "-bind-dialect=spirv",
+ ],
+ "mlir/dialects/_spirv_ops_gen.py",
+ ),
+ ],
+ tblgen = "//mlir:mlir-tblgen",
+ td_file = "mlir/dialects/SPIRVOps.td",
+ deps = [
+ "//mlir:OpBaseTdFiles",
+ "//mlir:SPIRVOpsTdFiles",
+ ],
+)
+
+filegroup(
+ name = "SPIRVOpsPyFiles",
+ srcs = [
+ "mlir/dialects/spirv.py",
+ ":SPIRVOpsPyGen",
+ ],
+)
+
+##---------------------------------------------------------------------------##
# Tensor dialect.
##---------------------------------------------------------------------------##
@@ -1451,7 +1482,7 @@ filegroup(
filegroup(
name = "TransformExtrasPackagePyFiles",
- srcs = glob(["mlir/extras/dialects/transform/*.py"]),
+ srcs = glob(["mlir/dialects/transform/extras/*.py"]),
)
##---------------------------------------------------------------------------##
diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake
index d464263c190a..fc1f9bf342f8 100644
--- a/utils/bazel/llvm_configs/config.h.cmake
+++ b/utils/bazel/llvm_configs/config.h.cmake
@@ -50,9 +50,15 @@
don't. */
#cmakedefine01 HAVE_DECL_STRERROR_S
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H}
+
/* Define if dlopen() is available on this platform. */
#cmakedefine HAVE_DLOPEN ${HAVE_DLOPEN}
+/* Define if dladdr() is available on this platform. */
+#cmakedefine HAVE_DLADDR ${HAVE_DLADDR}
+
/* Define to 1 if we can register EH frames on this platform. */
#cmakedefine HAVE_REGISTER_FRAME ${HAVE_REGISTER_FRAME}
diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake
index 483c5adc99ca..6605ea60df99 100644
--- a/utils/bazel/llvm_configs/llvm-config.h.cmake
+++ b/utils/bazel/llvm_configs/llvm-config.h.cmake
@@ -198,10 +198,4 @@
/* Define if plugins enabled */
#cmakedefine LLVM_ENABLE_PLUGINS
-/* Define to 1 if you have the <dlfcn.h> header file. */
-#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H}
-
-/* Define if dladdr() is available on this platform. */
-#cmakedefine HAVE_DLADDR ${HAVE_DLADDR}
-
#endif